Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for Series|Expr.skew method #1173

Merged
merged 25 commits into from
Nov 23, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
90d9742
Implement skew for Arrow, Pandas-like and Polars
CarloLepelaars Oct 14, 2024
c82fec1
Fix doctests
CarloLepelaars Oct 14, 2024
e118e4d
Remove skew in namespace. Remove n > 3 requirement. Fix expr doc
CarloLepelaars Oct 14, 2024
2530f81
Use biased population skewness
CarloLepelaars Oct 14, 2024
fc37529
Add pyarrow example for skew Expr
CarloLepelaars Oct 15, 2024
be2f503
Merge branch 'main' into feat/skew
CarloLepelaars Oct 15, 2024
02fdb4c
Fix: Add a_skew to schema
CarloLepelaars Oct 15, 2024
895be9c
Use native operation for PandasLikeSeries skew. Dask skew expr
CarloLepelaars Oct 17, 2024
a3b71bc
Use native pyarrow operations for skew
CarloLepelaars Oct 17, 2024
9ed06d7
Merge branch 'main' into feat/skew
CarloLepelaars Oct 17, 2024
4ff077d
Simplify arrow skew. non-trivial example for series.skew.
CarloLepelaars Oct 18, 2024
11efd49
unary_test with nan data. 2 element and 1 element unary tests
CarloLepelaars Oct 18, 2024
26a64f8
Fix doctest for Series skew
CarloLepelaars Oct 18, 2024
2014036
Make skew nan policy consistent with Polars
CarloLepelaars Oct 23, 2024
aaada24
Merge branch 'main' into feat/skew
FBruzzesi Oct 29, 2024
3e7eeab
merge main
FBruzzesi Oct 29, 2024
7f6fe07
merge main and add test for coverage
FBruzzesi Nov 12, 2024
2f2912c
Merge branch 'main' into feat/skew
FBruzzesi Nov 13, 2024
082664f
match RuntimeWarning for dask only
FBruzzesi Nov 13, 2024
56299a3
Merge remote-tracking branch 'upstream/main' into feat/skew
MarcoGorelli Nov 23, 2024
2dac3f3
stay pyarrow-native longer
MarcoGorelli Nov 23, 2024
0d3b6ec
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 23, 2024
7f91b19
fix mistake
MarcoGorelli Nov 23, 2024
3399530
Merge branch 'feat/skew' of github.com:CarloLepelaars/narwhals into f…
MarcoGorelli Nov 23, 2024
87f71d7
doctest
MarcoGorelli Nov 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
- sample
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
- shape
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,9 @@ def n_unique(self) -> Self:
def std(self, ddof: int = 1) -> Self:
    # Delegates to the shared helper with the method name "std", forwarding
    # `ddof`. `returns_scalar=True` presumably marks the result as a
    # single-value reduction (helper is defined outside this view - confirm).
    return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True)

def skew(self: Self) -> Self:
    # Delegates to the shared helper with the method name "skew".
    # `returns_scalar=True` presumably marks the result as a single-value
    # reduction (helper is defined outside this view - confirm).
    return reuse_series_implementation(self, "skew", returns_scalar=True)

def cast(self, dtype: DType) -> Self:
    # Delegates to the shared helper with the method name "cast", passing
    # `dtype` positionally; no `returns_scalar` flag is set here.
    return reuse_series_implementation(self, "cast", dtype)

Expand Down
22 changes: 21 additions & 1 deletion narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,31 @@ def shift(self, n: int) -> Self:
result = ca
return self._from_native_series(result)

def std(self, ddof: int = 1) -> int:
def std(self, ddof: int = 1) -> float:
import pyarrow.compute as pc # ignore-banned-import()

return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return]

def skew(self: Self) -> float | None:
    """Return the biased (population) skewness of the non-null values.

    Nulls are dropped first. The result then depends on how many values
    remain:

    - 0 values: `None`
    - 1 value: `nan`
    - 2 values: `0.0`
    - otherwise: `m3 / m2**1.5`, where `m2` and `m3` are the second and
      third central moments; `0.0` when `m2` is zero.
    """
    import pyarrow.compute as pc  # ignore-banned-import()

    non_null = pc.drop_null(self._native_series)
    n_valid = len(non_null)
    if n_valid == 0:
        return None
    if n_valid == 1:
        return float("nan")
    if n_valid == 2:
        return 0.0

    # Stay in pyarrow for the vectorised work; only the two scalar moments
    # are converted to Python floats.
    centered = pc.subtract(non_null, pc.mean(non_null))
    second_moment = float(pc.mean(pc.power(centered, 2)).as_py())
    third_moment = float(pc.mean(pc.power(centered, 3)).as_py())
    if second_moment == 0:
        # Zero variance: avoid dividing by zero.
        return 0.0
    return third_moment / second_moment**1.5  # Biased population skewness

def count(self) -> int:
import pyarrow.compute as pc # ignore-banned-import()

Expand Down
7 changes: 7 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,13 @@ def std(self, ddof: int = 1) -> Self:
returns_scalar=True,
)

def skew(self: Self) -> Self:
    # Builds the expression through `_from_call`, passing a callable that
    # invokes the input's own `skew()` method, the name "skew", and
    # `returns_scalar=True`.
    # NOTE(review): unlike the pandas-like and PyArrow series implementations,
    # no special-casing of 0, 1 or 2 values happens here, so results for those
    # sizes may differ from other backends.
    return self._from_call(
        lambda _input: _input.skew(),
        "skew",
        returns_scalar=True,
    )
Comment on lines +436 to +441
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the case of Dask, the behavior is not 100% consistent with Polars for inputs of length 0, 1, or 2.
Honestly, I am OK with that: the majority of use cases, especially those that need distributed data, should not involve such sizes to begin with.


def shift(self, n: int) -> Self:
return self._from_call(
lambda _input, n: _input.shift(n),
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ def median(self) -> Self:
def std(self, *, ddof: int = 1) -> Self:
    # Delegates to the shared helper with the method name "std", forwarding
    # the keyword-only `ddof`. `returns_scalar=True` presumably marks the
    # result as a single-value reduction (helper is outside this view - confirm).
    return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True)

def skew(self: Self) -> Self:
    # Delegates to the shared helper with the method name "skew".
    # `returns_scalar=True` presumably marks the result as a single-value
    # reduction (helper is defined outside this view - confirm).
    return reuse_series_implementation(self, "skew", returns_scalar=True)

def any(self) -> Self:
    # Delegates to the shared helper with the method name "any".
    # `returns_scalar=True` presumably marks the result as a single-value
    # reduction (helper is defined outside this view - confirm).
    return reuse_series_implementation(self, "any", returns_scalar=True)

Expand Down
24 changes: 18 additions & 6 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,13 +435,25 @@ def median(self) -> Any:
ser = self._native_series
return ser.median()

def std(
self,
*,
ddof: int = 1,
) -> Any:
def std(self: Self, *, ddof: int = 1) -> float:
ser = self._native_series
return ser.std(ddof=ddof)
return ser.std(ddof=ddof) # type: ignore[no-any-return]

def skew(self: Self) -> float | None:
    """Return the biased (population) skewness of the non-missing values.

    Missing values are removed with `dropna()` first. The result then
    depends on how many values remain:

    - 0 values: `None`
    - 1 value: `nan`
    - 2 values: `0.0`
    - otherwise: `m3 / m2**1.5`, where `m2` and `m3` are the second and
      third central moments; `0.0` when `m2` is zero.
    """
    non_null = self._native_series.dropna()
    n_valid = len(non_null)
    if n_valid == 0:
        return None
    if n_valid == 1:
        return float("nan")
    if n_valid == 2:
        return 0.0

    centered = non_null - non_null.mean()
    second_moment = (centered**2).mean()
    third_moment = (centered**3).mean()
    if second_moment == 0:
        # Zero variance: avoid dividing by zero.
        return 0.0
    return float(third_moment / second_moment**1.5)  # Biased population skewness

def len(self) -> Any:
return len(self._native_series)
Expand Down
48 changes: 47 additions & 1 deletion narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ def std(self, *, ddof: int = 1) -> Self:
Get standard deviation.

Arguments:
ddof: Delta Degrees of Freedom: the divisor used in the calculation is N - ddof,
ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
where N represents the number of elements. By default ddof is 1.

Examples:
Expand Down Expand Up @@ -608,6 +608,52 @@ def map_batches(
)
)

def skew(self: Self) -> Self:
    """
    Calculate the sample skewness of a column.

    Returns:
        An expression representing the sample skewness of the column.

    Notes:
        The pandas-like and PyArrow backends ignore null values and compute
        the biased (population) coefficient `m3 / m2**1.5`, where `m2` and
        `m3` are the second and third central moments. In those backends a
        column with no non-null values yields `None`, a single value yields
        `NaN`, and two values yield `0.0`. The Dask backend calls the native
        `skew` method directly, so results for such tiny inputs may differ.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})
        >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})
        >>> df_pa = pa.Table.from_pandas(df_pd)

        Let's define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.select(nw.col("a", "b").skew())

        We can then pass pandas, Polars, or PyArrow to `func`:

        >>> func(df_pd)
             a         b
        0  0.0  1.472427
        >>> func(df_pl)
        shape: (1, 2)
        ┌─────┬──────────┐
        │ a   ┆ b        │
        │ --- ┆ ---      │
        │ f64 ┆ f64      │
        ╞═════╪══════════╡
        │ 0.0 ┆ 1.472427 │
        └─────┴──────────┘
        >>> func(df_pa)
        pyarrow.Table
        a: double
        b: double
        ----
        a: [[0]]
        b: [[1.4724267269058975]]
    """
    return self.__class__(lambda plx: self._call(plx).skew())

def sum(self) -> Expr:
"""
Return the sum value.
Expand Down
36 changes: 36 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,42 @@ def median(self) -> Any:
"""
return self._compliant_series.median()

def skew(self: Self) -> Any:
    """
    Calculate the sample skewness of the Series.

    Returns:
        The sample skewness of the Series.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> s = [1, 1, 2, 10, 100]
        >>> s_pd = pd.Series(s)
        >>> s_pl = pl.Series(s)
        >>> s_pa = pa.array(s)

        We define a library agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return s.skew()

        We can pass any supported library such as Pandas, Polars, or PyArrow to `func`:

        >>> func(s_pd)
        1.4724267269058975
        >>> func(s_pl)
        1.4724267269058975

    Notes:
        The skewness is a measure of the asymmetry of the probability distribution.
        A perfectly symmetric distribution has a skewness of 0.

        The pandas-like and PyArrow backends ignore null values and compute
        the biased (population) coefficient `m3 / m2**1.5`, where `m2` and
        `m3` are the second and third central moments. In those backends a
        Series with no non-null values yields `None`, a single value yields
        `NaN`, and two values yield `0.0`.
    """
    return self._compliant_series.skew()

def count(self) -> Any:
"""
Returns the number of non-null elements in the Series.
Expand Down
29 changes: 29 additions & 0 deletions tests/expr_and_series/skew_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from __future__ import annotations

import pytest

import narwhals.stable.v1 as nw
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

# Shared input column; each test case keeps only its first `size` rows via `head(size)`.
data = [1, 2, 3, 2, 1]


@pytest.mark.parametrize(
    ("size", "expected"),
    [
        (0, None),
        (1, float("nan")),
        (2, 0.0),
        (5, 0.343622),
    ],
)
def test_skew_series(
    constructor_eager: ConstructorEager, size: int, expected: float | None
) -> None:
    """Check `Series.skew` on the first `size` rows of `data`.

    The cases cover the empty, one-element and two-element inputs as well as
    the full five-element column.
    """
    frame = nw.from_native(constructor_eager({"a": data}), eager_only=True)
    truncated = frame.head(size)
    result = truncated["a"].skew()
    assert_equal_data({"a": [result]}, {"a": [expected]})
Loading
Loading