feat: deprecate maintain_order in Expr.unique and `LazyFrame.tail` (but keep around in `stable.v1`) (#1839)
MarcoGorelli authored Jan 20, 2025
1 parent b26358b commit 7a9560b
Showing 13 changed files with 87 additions and 115 deletions.
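
Before the per-file diffs, a minimal sketch (not part of the commit) of what the change means for user code; the DataFrame contents are illustrative. In the main `narwhals` namespace, `Expr.unique` no longer accepts `maintain_order`, while `narwhals.stable.v1` keeps accepting it and emits a `UserWarning` instead.

    import pandas as pd

    import narwhals as nw
    import narwhals.stable.v1 as nw_v1

    df = pd.DataFrame({"a": [1, 1, 2]})  # illustrative data

    # Main namespace: `unique` takes no `maintain_order` argument any more; since row
    # order is then backend-dependent, an order-independent aggregation is typical.
    nw.from_native(df).select(nw.col("a").unique().sum())

    # stable.v1: the argument is still accepted, but passing it emits a UserWarning
    # saying it has no effect and can safely be removed.
    nw_v1.from_native(df).select(nw_v1.col("a").unique(maintain_order=True).sum())
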
4 changes: 2 additions & 2 deletions narwhals/_arrow/expr.py
@@ -352,8 +352,8 @@ def is_first_distinct(self: Self) -> Self:
     def is_last_distinct(self: Self) -> Self:
         return reuse_series_implementation(self, "is_last_distinct")

-    def unique(self: Self, *, maintain_order: bool) -> Self:
-        return reuse_series_implementation(self, "unique", maintain_order=maintain_order)
+    def unique(self: Self) -> Self:
+        return reuse_series_implementation(self, "unique", maintain_order=False)

     def replace_strict(
         self: Self, old: Sequence[Any], new: Sequence[Any], *, return_dtype: DType | None
4 changes: 1 addition & 3 deletions narwhals/_arrow/series.py
@@ -683,9 +683,7 @@ def is_sorted(self: Self, *, descending: bool) -> bool:
         return maybe_extract_py_scalar(result, return_py_scalar=True)  # type: ignore[no-any-return]

     def unique(self: Self, *, maintain_order: bool) -> ArrowSeries:
-        # The param `maintain_order` is only here for compatibility with the Polars API
-        # and has no effect on the output.
-
+        # TODO(marco): `pc.unique` seems to always maintain order, is that guaranteed?
         return self._from_native_series(pc.unique(self._native_series))

     def replace_strict(
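
As an aside on the TODO above, a quick illustrative check (not part of the commit) of how `pyarrow.compute.unique` behaves; empirically it returns values in order of first appearance, though, as the TODO notes, it is not clear that this is a documented guarantee.

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.chunked_array([[3, 1], [3, 2, 1]])  # illustrative data
    # Observed output: [3, 1, 2], i.e. order of first appearance, but treat this as
    # an empirical observation rather than a documented contract.
    print(pc.unique(arr))
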
4 changes: 2 additions & 2 deletions narwhals/_dask/dataframe.py
@@ -372,11 +372,11 @@ def group_by(self, *by: str, drop_null_keys: bool) -> DaskLazyGroupBy:

         return DaskLazyGroupBy(self, list(by), drop_null_keys=drop_null_keys)

-    def tail(self: Self, n: int) -> Self:
+    def tail(self: Self, n: int) -> Self:  # pragma: no cover
         native_frame = self._native_frame
         n_partitions = native_frame.npartitions

-        if n_partitions == 1:  # pragma: no cover
+        if n_partitions == 1:
             return self._from_native_frame(self._native_frame.tail(n=n, compute=False))
         else:
             msg = "`LazyFrame.tail` is not supported for Dask backend with multiple partitions."
3 changes: 1 addition & 2 deletions narwhals/_dask/expr.py
@@ -447,8 +447,7 @@ def round(self, decimals: int) -> Self:
             returns_scalar=self._returns_scalar,
         )

-    def unique(self, *, maintain_order: bool) -> Self:
-        # TODO(marco): maintain_order has no effect and will be deprecated
+    def unique(self) -> Self:
         return self._from_call(
             lambda _input: _input.unique(),
             "unique",
4 changes: 2 additions & 2 deletions narwhals/_pandas_like/expr.py
@@ -351,8 +351,8 @@ def abs(self) -> Self:
     def cum_sum(self: Self, *, reverse: bool) -> Self:
         return reuse_series_implementation(self, "cum_sum", reverse=reverse)

-    def unique(self, *, maintain_order: bool = False) -> Self:
-        return reuse_series_implementation(self, "unique", maintain_order=maintain_order)
+    def unique(self) -> Self:
+        return reuse_series_implementation(self, "unique", maintain_order=False)

     def diff(self) -> Self:
         return reuse_series_implementation(self, "diff")
8 changes: 4 additions & 4 deletions narwhals/_pandas_like/series.py
@@ -735,9 +735,9 @@ def cum_sum(self: Self, *, reverse: bool) -> Self:
         )
         return self._from_native_series(result)

-    def unique(self, *, maintain_order: bool = False) -> PandasLikeSeries:
-        # The param `maintain_order` is only here for compatibility with the Polars API
-        # and has no effect on the output.
+    def unique(self, *, maintain_order: bool) -> PandasLikeSeries:
+        # pandas always maintains order, as per its docstring:
+        # "Uniques are returned in order of appearance"  # noqa: ERA001
         return self._from_native_series(
             self._native_series.__class__(
                 self._native_series.unique(), name=self._native_series.name
@@ -779,7 +779,7 @@ def replace_strict(
         if result.is_null().sum() != self.is_null().sum():
             msg = (
                 "replace_strict did not replace all non-null values.\n\n"
-                f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique().to_list()}"
+                f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique(maintain_order=False).to_list()}"
             )
             raise ValueError(msg)
         return result
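
The new comment above quotes the pandas docstring; a short illustrative check (not part of the commit) of that documented behaviour:

    import pandas as pd

    s = pd.Series([3, 1, 3, 2, 1])  # illustrative data
    # pandas documents: "Uniques are returned in order of appearance."
    print(s.unique())  # -> array([3, 1, 2])
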
51 changes: 7 additions & 44 deletions narwhals/dataframe.py
@@ -4312,7 +4312,7 @@ def rename(self, mapping: dict[str, str]) -> Self:
         return super().rename(mapping)

     def head(self, n: int = 5) -> Self:
-        r"""Get the first `n` rows.
+        r"""Get `n` rows.

         Arguments:
             n: Number of rows to return.
@@ -4360,56 +4360,19 @@ def head(self, n: int = 5) -> Self:
         """
         return super().head(n)

-    def tail(self, n: int = 5) -> Self:
+    def tail(self, n: int = 5) -> Self:  # pragma: no cover
         r"""Get the last `n` rows.

+        !!! warning
+            `LazyFrame.tail` is deprecated and will be removed in a future version.
+            Note: this will remain available in `narwhals.stable.v1`.
+            See [stable api](../backcompat.md/) for more information.
+
         Arguments:
             n: Number of rows to return.

         Returns:
             A subset of the LazyFrame of shape (n, n_columns).
-
-        Notes:
-            `LazyFrame.tail` is not supported for the Dask backend with multiple
-            partitions.
-
-        Examples:
-            >>> import narwhals as nw
-            >>> import polars as pl
-            >>> import dask.dataframe as dd
-            >>> from narwhals.typing import IntoFrameT
-            >>>
-            >>> data = {
-            ...     "a": [1, 2, 3, 4, 5, 6],
-            ...     "b": [7, 8, 9, 10, 11, 12],
-            ... }
-            >>> lf_pl = pl.LazyFrame(data)
-            >>> lf_dask = dd.from_dict(data, npartitions=1)
-
-            Let's define a dataframe-agnostic function that gets the last 3 rows.
-
-            >>> def agnostic_tail(df_native: IntoFrameT) -> IntoFrameT:
-            ...     df = nw.from_native(df_native)
-            ...     return df.tail(3).collect().to_native()
-
-            We can then pass any supported library such as Polars or Dask to `agnostic_tail`:
-
-            >>> agnostic_tail(lf_pl)
-            shape: (3, 2)
-            ┌─────┬─────┐
-            │ a   ┆ b   │
-            │ --- ┆ --- │
-            │ i64 ┆ i64 │
-            ╞═════╪═════╡
-            │ 4   ┆ 10  │
-            │ 5   ┆ 11  │
-            │ 6   ┆ 12  │
-            └─────┴─────┘
-            >>> agnostic_tail(lf_dask)
-               a   b
-            3  4  10
-            4  5  11
-            5  6  12
         """
         return super().tail(n)

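
A minimal sketch (not part of the commit) of the migration path the deprecation warning points to: `LazyFrame.tail` remains available through `narwhals.stable.v1`, so code pinned to the stable API keeps working; the data here is illustrative.

    import polars as pl

    import narwhals.stable.v1 as nw_v1

    lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})  # illustrative data

    # `tail` stays part of the stable.v1 LazyFrame API.
    print(nw_v1.from_native(lf).tail(2).collect().to_native())
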
29 changes: 9 additions & 20 deletions narwhals/expr.py
@@ -1503,14 +1503,9 @@ def n_unique(self) -> Self:
             aggregates=True,
         )

-    def unique(self, *, maintain_order: bool = False) -> Self:
+    def unique(self) -> Self:
         """Return unique values of this expression.

-        Arguments:
-            maintain_order: Keep the same order as the original expression. This may be more
-                expensive to compute. Settings this to `True` blocks the possibility
-                to run on the streaming engine for Polars.
-
         Returns:
             A new expression.

@@ -1530,41 +1525,35 @@ def unique(self, *, maintain_order: bool = False) -> Self:
             >>> def agnostic_unique(df_native: IntoFrameT) -> IntoFrameT:
             ...     df = nw.from_native(df_native)
-            ...     return df.select(nw.col("a", "b").unique(maintain_order=True)).to_native()
+            ...     return df.select(nw.col("a", "b").unique().sum()).to_native()

             We can then pass any supported library such as pandas, Polars, or
             PyArrow to `agnostic_unique`:

             >>> agnostic_unique(df_pd)
-               a  b
-            0  1  2
-            1  3  4
-            2  5  6
+               a   b
+            0  9  12
             >>> agnostic_unique(df_pl)
-            shape: (3, 2)
+            shape: (1, 2)
             ┌─────┬─────┐
             │ a   ┆ b   │
             │ --- ┆ --- │
             │ i64 ┆ i64 │
             ╞═════╪═════╡
-            │ 1   ┆ 2   │
-            │ 3   ┆ 4   │
-            │ 5   ┆ 6   │
+            │ 9   ┆ 12  │
             └─────┴─────┘
             >>> agnostic_unique(df_pa)
             pyarrow.Table
             a: int64
             b: int64
             ----
-            a: [[1,3,5]]
-            b: [[2,4,6]]
+            a: [[9]]
+            b: [[12]]
         """
         return self.__class__(
-            lambda plx: self._to_compliant_expr(plx).unique(
-                maintain_order=maintain_order
-            ),
+            lambda plx: self._to_compliant_expr(plx).unique(),
             self._is_order_dependent,
             changes_length=True,
             aggregates=self._aggregates,
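
One note on the doctest change above: with `maintain_order` gone, the row order of `unique` is backend-dependent, so the example now aggregates with `.sum()` to keep its output deterministic. A hedged sketch of why the shown output is `9` and `12` (the `data` dict is not visible in this hunk; the values below are assumed so the sums match):

    import polars as pl

    import narwhals as nw

    data = {"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}  # assumed, not shown in the hunk

    # Summing the unique values is order-independent:
    # unique(a) = {1, 3, 5} -> 9, unique(b) = {2, 4, 6} -> 12.
    print(nw.from_native(pl.DataFrame(data)).select(nw.col("a", "b").unique().sum()).to_native())
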
36 changes: 36 additions & 0 deletions narwhals/stable/v1/__init__.py
@@ -69,6 +69,7 @@
 from narwhals.typing import IntoSeriesT
 from narwhals.utils import Implementation
 from narwhals.utils import Version
+from narwhals.utils import find_stacklevel
 from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import is_ordered_categorical
 from narwhals.utils import maybe_align_index
@@ -280,6 +281,17 @@ def _l1_norm(self: Self) -> Self:
         """
         return self.select(all()._l1_norm())

+    def tail(self, n: int = 5) -> Self:  # pragma: no cover
+        r"""Get the last `n` rows.
+
+        Arguments:
+            n: Number of rows to return.
+
+        Returns:
+            A subset of the LazyFrame of shape (n, n_columns).
+        """
+        return super().tail(n)
+

 class Series(NwSeries[Any]):
     """Narwhals Series, backed by a native series.
@@ -911,6 +923,30 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self:
             aggregates=self._aggregates,
         )

+    def unique(self, *, maintain_order: bool | None = None) -> Self:
+        """Return unique values of this expression.
+
+        Arguments:
+            maintain_order: Keep the same order as the original expression.
+                This is deprecated and will be removed in a future version,
+                but will still be kept around in `narwhals.stable.v1`.
+
+        Returns:
+            A new expression.
+        """
+        if maintain_order is not None:
+            msg = (
+                "`maintain_order` has no effect and is only kept around for backwards-compatibility. "
+                "You can safely remove this argument."
+            )
+            warn(message=msg, category=UserWarning, stacklevel=find_stacklevel())
+        return self.__class__(
+            lambda plx: self._to_compliant_expr(plx).unique(),
+            self._is_order_dependent,
+            changes_length=True,
+            aggregates=self._aggregates,
+        )
+
     def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self:
         """Sort this column. Place null values first.
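
An illustrative sketch (not part of the commit) of the behaviour this `stable.v1` override gives users: omitting `maintain_order` is silent, while passing it still works but raises a `UserWarning`.

    import warnings

    import pandas as pd

    import narwhals.stable.v1 as nw_v1

    df = nw_v1.from_native(pd.DataFrame({"a": [1, 1, 2]}))  # illustrative data

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df.select(nw_v1.col("a").unique().sum())                     # no warning
        df.select(nw_v1.col("a").unique(maintain_order=True).sum())  # warns
    assert any("maintain_order" in str(w.message) for w in caught)
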
6 changes: 4 additions & 2 deletions tests/conftest.py
@@ -205,7 +205,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             *iter(LAZY_CONSTRUCTORS.keys()),
         ]
         selected_constructors = [
-            x for x in selected_constructors if x not in GPU_CONSTRUCTORS
+            x
+            for x in selected_constructors
+            if x not in GPU_CONSTRUCTORS and x not in "modin"  # too slow
         ]
     else:  # pragma: no cover
         selected_constructors = metafunc.config.getoption("constructors").split(",")
@@ -242,7 +244,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
     if (
         any(
             x in str(metafunc.module)
-            for x in ("list", "unpivot", "from_dict", "from_numpy", "tail")
+            for x in ("list", "unpivot", "from_dict", "from_numpy")
         )
         and LAZY_CONSTRUCTORS["duckdb"] in constructors
     ):
8 changes: 8 additions & 0 deletions tests/expr_and_series/unique_test.py
@@ -5,6 +5,7 @@
 import pytest

 import narwhals as nw
+import narwhals.stable.v1 as nw_v1
 from narwhals.exceptions import LengthChangingExprError
 from tests.utils import Constructor
 from tests.utils import ConstructorEager
@@ -49,3 +50,10 @@ def test_unique_series(constructor_eager: ConstructorEager) -> None:
     result = series.unique(maintain_order=True)
     expected = {"a": ["x", "y"]}
     assert_equal_data({"a": result}, expected)
+
+    series = nw.from_native(constructor_eager(data), eager_only=True)["a"]
+    # this shouldn't warn
+    series.to_frame().select(nw_v1.col("a").unique().sum())
+    with pytest.warns(UserWarning):
+        # this warns that maintain_order has no effect
+        series.to_frame().select(nw_v1.col("a").unique(maintain_order=False).sum())
42 changes: 8 additions & 34 deletions tests/frame/tail_test.py
@@ -1,42 +1,16 @@
 from __future__ import annotations

-from contextlib import nullcontext as does_not_raise
-
-import pytest
-
 import narwhals.stable.v1 as nw
-from tests.utils import Constructor
+from tests.utils import ConstructorEager
 from tests.utils import assert_equal_data


-def test_tail(request: pytest.FixtureRequest, constructor: Constructor) -> None:
-    if "pyspark" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
-
+def test_tail(constructor_eager: ConstructorEager) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
     expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9]}
-
-    df_raw = constructor(data)
-    df = nw.from_native(df_raw).lazy()
-
-    context = (
-        pytest.raises(
-            NotImplementedError,
-            match="`LazyFrame.tail` is not supported for Dask backend with multiple partitions.",
-        )
-        if "dask_lazy_p2" in str(constructor)
-        else does_not_raise()
-    )
-
-    with context:
-        result = df.tail(2)
-        assert_equal_data(result, expected)
-
-        result = df.collect().tail(2)  # type: ignore[assignment]
-        assert_equal_data(result, expected)
-
-        result = df.collect().tail(-1)  # type: ignore[assignment]
-        assert_equal_data(result, expected)
-
-        result = df.collect().select(nw.col("a").tail(2))  # type: ignore[assignment]
-        assert_equal_data(result, {"a": expected["a"]})
+    df_raw = constructor_eager(data)
+    df = nw.from_native(df_raw)
+    result = df.tail(2)
+    assert_equal_data(result, expected)
+    result = df.tail(-1)
+    assert_equal_data(result, expected)
3 changes: 3 additions & 0 deletions tests/stable_api_test.py
@@ -133,6 +133,9 @@ def test_lazyframe_docstrings() -> None:
         if item in ("schema", "columns"):
             # to avoid performance warning
             continue
+        if item in ("tail",):
+            # deprecated
+            continue
         assert remove_docstring_examples(
             getattr(stable_df, item).__doc__.replace(
                 "import narwhals.stable.v1 as nw", "import narwhals as nw"
