Skip to content

Commit

Permalink
feat: add support for Series|Expr.skew method (#1173)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: FBruzzesi <francesco.bruzzesi.93@gmail.com>
Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Nov 23, 2024
1 parent afd92dc commit 35c34f4
Show file tree
Hide file tree
Showing 11 changed files with 284 additions and 13 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
- sample
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
- shape
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@ def n_unique(self) -> Self:
def std(self, ddof: int = 1) -> Self:
    """Build the standard-deviation expression by delegating to the series method.

    Arguments:
        ddof: Forwarded unchanged as the `ddof` keyword of the series-level `std`.

    Returns:
        The result of `reuse_series_implementation` for the `"std"` method,
        requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "std",
        ddof=ddof,
        returns_scalar=True,
    )

def skew(self: Self) -> Self:
    """Build the skewness expression by delegating to the series-level `skew`.

    Returns:
        The result of `reuse_series_implementation` for the `"skew"` method,
        requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "skew",
        returns_scalar=True,
    )

def cast(self, dtype: DType) -> Self:
    """Build the cast expression by delegating to the series-level `cast`.

    Arguments:
        dtype: Target dtype, passed positionally to the series method.

    Returns:
        The result of `reuse_series_implementation` for the `"cast"` method.
    """
    return reuse_series_implementation(
        self,
        "cast",
        dtype,
    )

Expand Down
20 changes: 19 additions & 1 deletion narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,29 @@ def shift(self, n: int) -> Self:
result = ca
return self._from_native_series(result)

def std(self, ddof: int = 1) -> float:
    """Return the standard deviation of the native series via `pyarrow.compute.stddev`.

    Arguments:
        ddof: Forwarded to `pc.stddev` as its `ddof` keyword.

    Returns:
        Whatever `pc.stddev` returns for the native series.
    """
    import pyarrow.compute as pc  # ignore-banned-import()

    return pc.stddev(self._native_series, ddof=ddof)  # type: ignore[no-any-return]

def skew(self: Self) -> float | None:
    """Compute the biased (population) skewness of the non-null values.

    Nulls are dropped first. The statistic is `m3 / m2**1.5`, where `m2` and
    `m3` are the means of the squared and cubed deviations from the mean.

    Returns:
        `None` when no non-null values remain, `nan` for a single value,
        `0.0` for exactly two values, otherwise the skewness as a plain
        Python float.
    """
    import pyarrow.compute as pc  # ignore-banned-import()

    ser_not_null = pc.drop_null(self._native_series)
    n_valid = len(ser_not_null)
    if n_valid == 0:
        return None
    if n_valid == 1:
        return float("nan")
    if n_valid == 2:
        return 0.0

    m = pc.subtract(ser_not_null, pc.mean(ser_not_null))
    m2 = pc.mean(pc.power(m, 2))
    m3 = pc.mean(pc.power(m, 3))
    # Biased population skewness. The arithmetic stays in pyarrow so the
    # zero-variance case keeps pyarrow's floating-point division semantics;
    # only the final scalar is unwrapped, so every branch returns a plain
    # Python value (as annotated) instead of a `pyarrow.Scalar`.
    return pc.divide(m3, pc.power(m2, 1.5)).as_py()  # type: ignore[no-any-return]

def count(self) -> int:
import pyarrow.compute as pc # ignore-banned-import()

Expand Down
7 changes: 7 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,13 @@ def std(self, ddof: int = 1) -> Self:
returns_scalar=True,
)

def skew(self: Self) -> Self:
    """Build the skewness expression through `_from_call`.

    Returns:
        The result of `self._from_call` for a callable that invokes `.skew()`
        on its input, registered under the name `"skew"` with
        `returns_scalar=True`.
    """

    def _native_skew(_input):
        # Defer entirely to the input object's own `skew` method.
        return _input.skew()

    return self._from_call(_native_skew, "skew", returns_scalar=True)

def shift(self, n: int) -> Self:
return self._from_call(
lambda _input, n: _input.shift(n),
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,9 @@ def median(self) -> Self:
def std(self, *, ddof: int = 1) -> Self:
    """Build the standard-deviation expression by delegating to the series method.

    Arguments:
        ddof: Keyword-only; forwarded unchanged as the `ddof` keyword of the
            series-level `std`.

    Returns:
        The result of `reuse_series_implementation` for the `"std"` method,
        requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "std",
        ddof=ddof,
        returns_scalar=True,
    )

def skew(self: Self) -> Self:
    """Build the skewness expression by delegating to the series-level `skew`.

    Returns:
        The result of `reuse_series_implementation` for the `"skew"` method,
        requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "skew",
        returns_scalar=True,
    )

def any(self) -> Self:
    """Build the `any` expression by delegating to the series-level `any`.

    Returns:
        The result of `reuse_series_implementation` for the `"any"` method,
        requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "any",
        returns_scalar=True,
    )

Expand Down
23 changes: 17 additions & 6 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,13 +454,24 @@ def median(self) -> Any:
ser = self._native_series
return ser.median()

def std(self: Self, *, ddof: int = 1) -> float:
    """Return the standard deviation of the native series.

    Arguments:
        ddof: Keyword-only; forwarded to the native series' `std` as `ddof`.

    Returns:
        The value returned by the native series' `std(ddof=ddof)`.
    """
    ser = self._native_series
    return ser.std(ddof=ddof)  # type: ignore[no-any-return]

def skew(self: Self) -> float | None:
    """Compute the biased (population) skewness of the non-null values.

    Nulls are dropped first. The statistic is `m3 / m2**1.5`, where `m2` and
    `m3` are the means of the squared and cubed deviations from the mean.

    Returns:
        `None` when no non-null values remain, `nan` for a single value,
        `0.0` for exactly two values, `nan` when the second moment is zero,
        otherwise the skewness.
    """
    valid = self._native_series.dropna()
    n_valid = len(valid)

    # Degenerate sizes are answered directly, without computing moments.
    if n_valid == 0:
        return None
    if n_valid == 1:
        return float("nan")
    if n_valid == 2:
        return 0.0

    centered = valid - valid.mean()
    second_moment = (centered**2).mean()
    third_moment = (centered**3).mean()
    if second_moment == 0:
        # All remaining values are identical: the ratio would be 0/0.
        return float("nan")
    return third_moment / (second_moment**1.5)

def len(self) -> Any:
return len(self._native_series)
Expand Down
47 changes: 46 additions & 1 deletion narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ def std(self, *, ddof: int = 1) -> Self:
"""Get standard deviation.
Arguments:
ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
where N represents the number of elements. By default ddof is 1.
Returns:
Expand Down Expand Up @@ -743,6 +743,51 @@ def map_batches(
)
)

def skew(self: Self) -> Self:
    """Calculate the skewness of a column.

    Returns:
        An expression representing the skewness of the column.

    Notes:
        The pandas-like and PyArrow backends compute the biased (population)
        skewness `m3 / m2**1.5`, where `m2` and `m3` are the means of the
        squared and cubed deviations from the mean, after dropping nulls.
        No small-sample bias correction is applied.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})
        >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})
        >>> df_pa = pa.Table.from_pandas(df_pd)

        Let's define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.select(nw.col("a", "b").skew())

        We can then pass pandas, Polars, or PyArrow to `func`:

        >>> func(df_pd)
             a         b
        0  0.0  1.472427
        >>> func(df_pl)
        shape: (1, 2)
        ┌─────┬──────────┐
        │ a   ┆ b        │
        │ --- ┆ ---      │
        │ f64 ┆ f64      │
        ╞═════╪══════════╡
        │ 0.0 ┆ 1.472427 │
        └─────┴──────────┘
        >>> func(df_pa)
        pyarrow.Table
        a: double
        b: double
        ----
        a: [[0]]
        b: [[1.4724267269058975]]
    """
    return self.__class__(lambda plx: self._call(plx).skew())

def sum(self) -> Expr:
"""Return the sum value.
Expand Down
35 changes: 35 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,41 @@ def median(self) -> Any:
"""
return self._compliant_series.median()

def skew(self: Self) -> Any:
    """Calculate the skewness of the Series.

    Returns:
        The skewness of the Series, as returned by the underlying
        compliant series' `skew` method.

    Notes:
        The skewness is a measure of the asymmetry of the probability distribution.
        A perfectly symmetric distribution has a skewness of 0.

        The pandas-like and PyArrow backends compute the biased (population)
        skewness `m3 / m2**1.5`, where `m2` and `m3` are the means of the
        squared and cubed deviations from the mean, after dropping nulls.
        No small-sample bias correction is applied.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> s = [1, 1, 2, 10, 100]
        >>> s_pd = pd.Series(s)
        >>> s_pl = pl.Series(s)
        >>> s_pa = pa.array(s)

        We define a library agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return s.skew()

        We can pass any supported library such as pandas, Polars, or PyArrow to `func`:

        >>> func(s_pd)
        np.float64(1.4724267269058975)
        >>> func(s_pl)
        1.4724267269058975
    """
    return self._compliant_series.skew()

def count(self) -> Any:
"""Returns the number of non-null elements in the Series.
Expand Down
26 changes: 26 additions & 0 deletions tests/expr_and_series/skew_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from __future__ import annotations

import pytest

import narwhals.stable.v1 as nw
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

@pytest.mark.parametrize(
    ("data", "expected"),
    [
        ([], None),
        ([1], float("nan")),
        ([1, 2], 0.0),
        ([0.0, 0.0, 0.0], float("nan")),
        ([1, 2, 3, 2, 1], 0.343622),
    ],
)
def test_skew_series(
    constructor_eager: ConstructorEager, data: list[float], expected: float | None
) -> None:
    """Check `Series.skew` on the degenerate sizes (0, 1, 2 values), a constant
    series, and one ordinary sample, for every eager constructor."""
    series = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"]
    result = series.skew()
    assert_equal_data({"a": [result]}, {"a": [expected]})
Loading

0 comments on commit 35c34f4

Please sign in to comment.