Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for Series|Expr.skew method #1173

Merged
merged 25 commits into from
Nov 23, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
90d9742
Implement skew for Arrow, Pandas-like and Polars
CarloLepelaars Oct 14, 2024
c82fec1
Fix doctests
CarloLepelaars Oct 14, 2024
e118e4d
Remove skew in namespace. Remove n > 3 requirement. Fix expr doc
CarloLepelaars Oct 14, 2024
2530f81
Use biases population skewness
CarloLepelaars Oct 14, 2024
fc37529
Add pyarrow example for skew Expr
CarloLepelaars Oct 15, 2024
be2f503
Merge branch 'main' into feat/skew
CarloLepelaars Oct 15, 2024
02fdb4c
Fix: Add a_skew to schema
CarloLepelaars Oct 15, 2024
895be9c
Use native operation for PandasLikeSeries skew. Dask skew expr
CarloLepelaars Oct 17, 2024
a3b71bc
Use native pyarrow operations for skew
CarloLepelaars Oct 17, 2024
9ed06d7
Merge branch 'main' into feat/skew
CarloLepelaars Oct 17, 2024
4ff077d
Simplify arrow skew. non-trivial example for series.skew.
CarloLepelaars Oct 18, 2024
11efd49
unary_test with nan data. 2 element and 1 element unary tests
CarloLepelaars Oct 18, 2024
26a64f8
Fix doctest for Series skew
CarloLepelaars Oct 18, 2024
2014036
Make skew nan policy consistent with Polars
CarloLepelaars Oct 23, 2024
aaada24
Merge branch 'main' into feat/skew
FBruzzesi Oct 29, 2024
3e7eeab
merge main
FBruzzesi Oct 29, 2024
7f6fe07
merge main and add test for coverage
FBruzzesi Nov 12, 2024
2f2912c
Merge branch 'main' into feat/skew
FBruzzesi Nov 13, 2024
082664f
match RuntimeWarning for dask only
FBruzzesi Nov 13, 2024
56299a3
Merge remote-tracking branch 'upstream/main' into feat/skew
MarcoGorelli Nov 23, 2024
2dac3f3
stay pyarrow-native longer
MarcoGorelli Nov 23, 2024
0d3b6ec
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 23, 2024
7f91b19
fix mistake
MarcoGorelli Nov 23, 2024
3399530
Merge branch 'feat/skew' of github.com:CarloLepelaars/narwhals into f…
MarcoGorelli Nov 23, 2024
87f71d7
doctest
MarcoGorelli Nov 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
- sample
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
- shape
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,9 @@ def n_unique(self) -> Self:
def std(self, ddof: int = 1) -> Self:
    """Build the standard-deviation expression from the series-level `std`.

    Arguments:
        ddof: Forwarded unchanged, by keyword, to the series implementation.

    Returns:
        Whatever `reuse_series_implementation` produces for the `"std"`
        series method, requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "std",
        ddof=ddof,
        returns_scalar=True,
    )

def skew(self) -> Self:
    """Build the skewness expression from the series-level `skew`.

    Returns:
        Whatever `reuse_series_implementation` produces for the `"skew"`
        series method, requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "skew",
        returns_scalar=True,
    )

def cast(self, dtype: DType) -> Self:
    """Build the cast expression from the series-level `cast`.

    Arguments:
        dtype: Target dtype, forwarded positionally to the series
            implementation.

    Returns:
        Whatever `reuse_series_implementation` produces for the `"cast"`
        series method.
    """
    return reuse_series_implementation(
        self,
        "cast",
        dtype,
    )

Expand Down
5 changes: 5 additions & 0 deletions narwhals/_arrow/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,11 @@ def mean(self, *column_names: str) -> ArrowExpr:
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
).mean()

def skew(self, *column_names: str) -> ArrowExpr:
    """Return the skewness expression for the given columns.

    Arguments:
        column_names: Names of the columns to select.

    Returns:
        The result of calling `.skew()` on the `ArrowExpr` built from
        `column_names` with this namespace's backend version and dtypes.
    """
    selected = ArrowExpr.from_column_names(
        *column_names,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.skew()

CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved
def max(self, *column_names: str) -> ArrowExpr:
return ArrowExpr.from_column_names(
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
Expand Down
12 changes: 12 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals._arrow.utils import narwhals_to_native_dtype
from narwhals._arrow.utils import native_to_narwhals_dtype
from narwhals._arrow.utils import validate_column_comparand
from narwhals.dependencies import get_numpy
from narwhals.utils import Implementation
from narwhals.utils import generate_unique_token

Expand Down Expand Up @@ -298,6 +299,17 @@ def std(self, ddof: int = 1) -> int:

return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return]

def skew(self) -> float:
    """Compute the biased (population) skewness of the series.

    The result is the Fisher-Pearson coefficient ``g1 = m3 / m2 ** 1.5``,
    where ``m2`` and ``m3`` are the second and third central moments of the
    non-null values.

    Returns:
        The skewness as a Python float. ``nan`` is returned when fewer than
        two non-null values are available, or when the values have zero
        variance (``g1`` would be ``0 / 0``).
    """
    import pyarrow.compute as pc  # ignore-banned-import()

    # Stay pyarrow-native instead of round-tripping through numpy, and drop
    # nulls up front so only valid values enter the moment computation.
    values = pc.drop_null(self._native_series)

    # Guard before doing any arithmetic: with fewer than two values there
    # is no spread to measure (and no mean to take for empty input).
    if len(values) < 2:
        return float("nan")

    centered = pc.subtract(values, pc.mean(values))
    m2 = pc.mean(pc.power(centered, 2)).as_py()
    m3 = pc.mean(pc.power(centered, 3)).as_py()

    # Constant data: avoid dividing by zero and report nan explicitly.
    if m2 == 0:
        return float("nan")
    return float(m3 / m2**1.5)

def count(self) -> int:
import pyarrow.compute as pc # ignore-banned-import()

Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ def mean(self) -> Self:
def std(self, *, ddof: int = 1) -> Self:
    """Build the standard-deviation expression from the series-level `std`.

    Arguments:
        ddof: Keyword-only; forwarded unchanged to the series implementation.

    Returns:
        Whatever `reuse_series_implementation` produces for the `"std"`
        series method, requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "std",
        ddof=ddof,
        returns_scalar=True,
    )

def skew(self) -> Self:
    """Build the skewness expression from the series-level `skew`.

    Returns:
        Whatever `reuse_series_implementation` produces for the `"skew"`
        series method, requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "skew",
        returns_scalar=True,
    )

def any(self) -> Self:
    """Build the `any` expression from the series-level `any`.

    Returns:
        Whatever `reuse_series_implementation` produces for the `"any"`
        series method, requested with `returns_scalar=True`.
    """
    return reuse_series_implementation(
        self,
        "any",
        returns_scalar=True,
    )

Expand Down
8 changes: 8 additions & 0 deletions narwhals/_pandas_like/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,14 @@ def mean(self, *column_names: str) -> PandasLikeExpr:
dtypes=self._dtypes,
).mean()

def skew(self, *column_names: str) -> PandasLikeExpr:
    """Return the skewness expression for the given columns.

    Arguments:
        column_names: Names of the columns to select.

    Returns:
        The result of calling `.skew()` on the `PandasLikeExpr` built from
        `column_names` with this namespace's implementation, backend version
        and dtypes.
    """
    selected = PandasLikeExpr.from_column_names(
        *column_names,
        implementation=self._implementation,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.skew()
CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved

def max(self, *column_names: str) -> PandasLikeExpr:
return PandasLikeExpr.from_column_names(
*column_names,
Expand Down
19 changes: 19 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from narwhals._pandas_like.utils import set_axis
from narwhals._pandas_like.utils import to_datetime
from narwhals._pandas_like.utils import validate_column_comparand
from narwhals.dependencies import get_numpy
from narwhals.dependencies import get_pandas
from narwhals.utils import Implementation

if TYPE_CHECKING:
Expand Down Expand Up @@ -424,6 +426,23 @@ def std(
ser = self._native_series
return ser.std(ddof=ddof)

def skew(self) -> Any:
    """Compute the biased (population) skewness of the series.

    The result is the Fisher-Pearson coefficient ``g1 = m3 / m2 ** 1.5``,
    where ``m2`` and ``m3`` are the second and third central moments of the
    non-null values.

    The native ``Series.skew`` is deliberately not used: pandas applies a
    sample bias correction there, so it reports a different number from the
    biased estimator for the same data. Computing ``g1`` from elementary
    series operations gives one definition for every dtype and also avoids
    needing ``pandas`` itself (and ``pd.ArrowDtype``) to be importable.

    Returns:
        The skewness as a Python float. ``nan`` is returned when fewer than
        two non-null values are available, or when the values have zero
        variance (``g1`` would be ``0 / 0``).
    """
    # Nulls carry no information about the distribution; drop them so the
    # element count and the moments are taken over the same values.
    values = self._native_series.dropna()
    if len(values) < 2:
        return float("nan")

    centered = values - values.mean()
    m2 = float((centered**2).mean())
    m3 = float((centered**3).mean())

    # Constant data: avoid dividing by zero and report nan explicitly.
    if m2 == 0:
        return float("nan")
    return m3 / m2**1.5

def len(self) -> Any:
    """Return the number of elements in the wrapped native series.

    Returns:
        The result of the builtin `len` applied to `self._native_series`.
    """
    native = self._native_series
    return len(native)

Expand Down
9 changes: 8 additions & 1 deletion narwhals/_polars/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from typing import Sequence

from narwhals._expression_parsing import parse_into_exprs
from narwhals._polars.expr import PolarsExpr
from narwhals._polars.utils import extract_args_kwargs
from narwhals._polars.utils import narwhals_to_native_dtype
from narwhals.utils import Implementation

if TYPE_CHECKING:
from narwhals._polars.dataframe import PolarsDataFrame
from narwhals._polars.dataframe import PolarsLazyFrame
from narwhals._polars.expr import PolarsExpr
from narwhals._polars.typing import IntoPolarsExpr
from narwhals.dtypes import DType
from narwhals.typing import DTypes
Expand Down Expand Up @@ -98,6 +98,13 @@ def mean(self, *column_names: str) -> PolarsExpr:
return PolarsExpr(pl.mean([*column_names]), dtypes=self._dtypes) # type: ignore[arg-type]
return PolarsExpr(pl.mean(*column_names), dtypes=self._dtypes)

def skew(self, *column_names: str) -> PolarsExpr:
    """Return the skewness expression for the given columns.

    Arguments:
        column_names: Names of the columns to select.

    Returns:
        A `PolarsExpr` wrapping the Polars expression that selects
        `column_names` and applies `.skew()`, carrying this namespace's
        dtypes.
    """
    import polars as pl  # ignore-banned-import()

    # Polars exposes skewness as an expression method, not as a top-level
    # `pl.skew` helper, so select the columns and call `.skew()` on them.
    # `pl.col` accepts a list of names, which makes a version-specific
    # branch for positional-vs-list arguments unnecessary here.
    return PolarsExpr(pl.col([*column_names]).skew(), dtypes=self._dtypes)
CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved

def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr:
import polars as pl # ignore-banned-import()

Expand Down
39 changes: 38 additions & 1 deletion narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def std(self, *, ddof: int = 1) -> Self:
Get standard deviation.

Arguments:
ddof: Delta Degrees of Freedom: the divisor used in the calculation is N - ddof,
ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
where N represents the number of elements. By default ddof is 1.

Examples:
Expand Down Expand Up @@ -433,6 +433,43 @@ def std(self, *, ddof: int = 1) -> Self:
"""
return self.__class__(lambda plx: self._call(plx).std(ddof=ddof))

def skew(self) -> Self:
    """
    Calculate the skewness of a column.

    Returns:
        An expression representing the skewness of the column.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})
        >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})

        Let's define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.select(nw.col("a", "b").skew())

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)
             a         b
        0  0.0  2.194964
        >>> func(df_pl)
        shape: (1, 2)
        ┌─────┬──────────┐
        │ a   ┆ b        │
        │ --- ┆ ---      │
        │ f64 ┆ f64      │
        ╞═════╪══════════╡
        │ 0.0 ┆ 1.472427 │
        └─────┴──────────┘
    """
    # NOTE(review): the two example outputs above disagree for column "b"
    # (2.194964 vs 1.472427) -- presumably a bias-corrected vs a biased
    # estimator; confirm which definition the backends should share.

    def _skew(plx):
        # Resolve this expression against the backend namespace, then ask
        # the resulting compliant expression for its skewness.
        return self._call(plx).skew()

    return self.__class__(_skew)

def sum(self) -> Expr:
"""
Return the sum value.
Expand Down
34 changes: 34 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,40 @@ def mean(self) -> Any:
"""
return self._compliant_series.mean()

def skew(self) -> Any:
    """
    Calculate the skewness of the Series.

    Returns:
        The skewness of the Series, as returned by the underlying
        backend-specific series.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> s = [1, 2, 3, 4, 5]
        >>> s_pd = pd.Series(s)
        >>> s_pl = pl.Series(s)

        We define a library agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return s.skew()

        We can then pass either pandas or Polars to `func`:

        >>> func(s_pd)
        0.0
        >>> func(s_pl)
        0.0

    Notes:
        The skewness is a measure of the asymmetry of the probability
        distribution. A perfectly symmetric distribution has a skewness
        of 0.
    """
    # Pure delegation: the backend-specific series does the computation.
    compliant = self._compliant_series
    return compliant.skew()

def count(self) -> Any:
"""
Returns the number of non-null elements in the Series.
Expand Down
14 changes: 10 additions & 4 deletions tests/expr_and_series/unary_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None
.with_columns(
a_mean=nw.col("a").mean(),
a_sum=nw.col("a").sum(),
a_skew=nw.col("a").skew(),
b_nunique=nw.col("b").n_unique(),
z_min=nw.col("z").min(),
z_max=nw.col("z").max(),
)
.select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique())
.select(
nw.col("a_mean", "a_sum", "a_skew", "b_nunique", "z_min", "z_max").unique()
)
)
expected = {
"a_mean": [2],
"a_sum": [6],
"a_skew": [0.0],
"b_nunique": [2],
"z_min": [7],
"z_max": [9],
Expand All @@ -38,15 +42,17 @@ def test_unary_series(constructor_eager: Any) -> None:
result = {
"a_mean": [df["a"].mean()],
"a_sum": [df["a"].sum()],
"a_skew": [df["a"].skew()],
"b_nunique": [df["b"].n_unique()],
"z_min": [df["z"].min()],
"z_max": [df["z"].max()],
}
expected = {
"a_mean": [2],
"a_mean": [2.0],
"a_sum": [6],
"a_skew": [0.0],
"b_nunique": [2],
"z_min": [7],
"z_max": [9],
"z_min": [7.0],
"z_max": [9.0],
}
compare_dicts(result, expected)
Loading