diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 3ca53934b..91f49a0a1 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -51,6 +51,7 @@ - sample - shift - sort + - skew - std - sum - tail diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 7e7c85230..7c7a5ed17 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -60,6 +60,7 @@ - shape - shift - sort + - skew - std - sum - tail diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index d9ee5a361..4c04a1827 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -227,6 +227,9 @@ def n_unique(self) -> Self: def std(self, ddof: int = 1) -> Self: return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True) + def skew(self: Self) -> Self: + return reuse_series_implementation(self, "skew", returns_scalar=True) + def cast(self, dtype: DType) -> Self: return reuse_series_implementation(self, "cast", dtype) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 4ced6da54..ee06f1fa0 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -305,11 +305,29 @@ def shift(self, n: int) -> Self: result = ca return self._from_native_series(result) - def std(self, ddof: int = 1) -> int: + def std(self, ddof: int = 1) -> float: import pyarrow.compute as pc # ignore-banned-import() return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return] + def skew(self: Self) -> float | None: + import pyarrow.compute as pc # ignore-banned-import() + + ser = self._native_series + ser_not_null = pc.drop_null(ser) + if len(ser_not_null) == 0: + return None + elif len(ser_not_null) == 1: + return float("nan") + elif len(ser_not_null) == 2: + return 0.0 + else: + m = pc.subtract(ser_not_null, pc.mean(ser_not_null)) + m2 = pc.mean(pc.power(m, 2)) + m3 = pc.mean(pc.power(m, 3)) + # Biased population skewness + return pc.divide(m3, pc.power(m2, 1.5)) # type: 
ignore[no-any-return] + def count(self) -> int: import pyarrow.compute as pc # ignore-banned-import() diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 92d670908..58e73792a 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -433,6 +433,13 @@ def std(self, ddof: int = 1) -> Self: returns_scalar=True, ) + def skew(self: Self) -> Self: + return self._from_call( + lambda _input: _input.skew(), + "skew", + returns_scalar=True, + ) + def shift(self, n: int) -> Self: return self._from_call( lambda _input, n: _input.shift(n), diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index ebbc05fe5..182ea980f 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -236,6 +236,9 @@ def median(self) -> Self: def std(self, *, ddof: int = 1) -> Self: return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True) + def skew(self: Self) -> Self: + return reuse_series_implementation(self, "skew", returns_scalar=True) + def any(self) -> Self: return reuse_series_implementation(self, "any", returns_scalar=True) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 0fca8ca4f..c8520529a 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -454,13 +454,24 @@ def median(self) -> Any: ser = self._native_series return ser.median() - def std( - self, - *, - ddof: int = 1, - ) -> Any: + def std(self: Self, *, ddof: int = 1) -> float: ser = self._native_series - return ser.std(ddof=ddof) + return ser.std(ddof=ddof) # type: ignore[no-any-return] + + def skew(self: Self) -> float | None: + ser = self._native_series + ser_not_null = ser.dropna() + if len(ser_not_null) == 0: + return None + elif len(ser_not_null) == 1: + return float("nan") + elif len(ser_not_null) == 2: + return 0.0 + else: + m = ser_not_null - ser_not_null.mean() + m2 = (m**2).mean() + m3 = (m**3).mean() + return m3 / (m2**1.5) if m2 != 0 else float("nan") def len(self) 
-> Any: return len(self._native_series) diff --git a/narwhals/expr.py b/narwhals/expr.py index afa998db8..a43485e09 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -623,7 +623,7 @@ def std(self, *, ddof: int = 1) -> Self: """Get standard deviation. Arguments: - ddof: “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. Returns: @@ -743,6 +743,51 @@ def map_batches( ) ) + def skew(self: Self) -> Self: + """Calculate the sample skewness of a column. + + Returns: + An expression representing the sample skewness of the column. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) + >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) + >>> df_pa = pa.Table.from_pandas(df_pd) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.col("a", "b").skew()) + + We can then pass pandas, Polars, or PyArrow to `func`: + + >>> func(df_pd) + a b + 0 0.0 1.472427 + >>> func(df_pl) + shape: (1, 2) + ┌─────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════╡ + │ 0.0 ┆ 1.472427 │ + └─────┴──────────┘ + >>> func(df_pa) + pyarrow.Table + a: double + b: double + ---- + a: [[0]] + b: [[1.4724267269058975]] + """ + return self.__class__(lambda plx: self._call(plx).skew()) + def sum(self) -> Expr: """Return the sum value. diff --git a/narwhals/series.py b/narwhals/series.py index ac827303e..a5224ceae 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -657,6 +657,41 @@ def median(self) -> Any: """ return self._compliant_series.median() + def skew(self: Self) -> Any: + """Calculate the sample skewness of the Series. 
+ + Returns: + The sample skewness of the Series. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> s = [1, 1, 2, 10, 100] + >>> s_pd = pd.Series(s) + >>> s_pl = pl.Series(s) + >>> s_pa = pa.array(s) + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.skew() + + We can pass any supported library such as pandas or Polars to `func`: + + >>> func(s_pd) + np.float64(1.4724267269058975) + >>> func(s_pl) + 1.4724267269058975 + + Notes: + The skewness is a measure of the asymmetry of the probability distribution. + A perfectly symmetric distribution has a skewness of 0. + """ + return self._compliant_series.skew() + def count(self) -> Any: """Returns the number of non-null elements in the Series. diff --git a/tests/expr_and_series/skew_test.py b/tests/expr_and_series/skew_test.py new file mode 100644 index 000000000..b2029d08e --- /dev/null +++ b/tests/expr_and_series/skew_test.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +data = [1, 2, 3, 2, 1] + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([], None), + ([1], float("nan")), + ([1, 2], 0.0), + ([0.0, 0.0, 0.0], float("nan")), + ([1, 2, 3, 2, 1], 0.343622), + ], +) +def test_skew_series( + constructor_eager: ConstructorEager, data: list[float], expected: float | None +) -> None: + result = nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"].skew() + assert_equal_data({"a": [result]}, {"a": [expected]}) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 9a0c4caaa..3a580b726 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -1,5 +1,9 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise + 
+import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -7,12 +11,20 @@ def test_unary(constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = { + "a": [1, 3, 2], + "b": [4, 4, 6], + "c": [7.0, 8.0, None], + "z": [7.0, 8, 9], + } result = nw.from_native(constructor(data)).select( a_mean=nw.col("a").mean(), a_median=nw.col("a").median(), a_sum=nw.col("a").sum(), + a_skew=nw.col("a").skew(), b_nunique=nw.col("b").n_unique(), + b_skew=nw.col("b").skew(), + c_nunique=nw.col("c").n_unique(), z_min=nw.col("z").min(), z_max=nw.col("z").max(), ) @@ -20,7 +32,10 @@ def test_unary(constructor: Constructor) -> None: "a_mean": [2], "a_median": [2], "a_sum": [6], + "a_skew": [0.0], "b_nunique": [2], + "b_skew": [0.7071067811865465], + "c_nunique": [3], "z_min": [7], "z_max": [9], } @@ -28,22 +43,128 @@ def test_unary(constructor: Constructor) -> None: def test_unary_series(constructor_eager: ConstructorEager) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = { + "a": [1, 3, 2], + "b": [4, 4, 6], + "c": [7.0, 8.0, None], + "z": [7.0, 8, 9], + } df = nw.from_native(constructor_eager(data), eager_only=True) result = { "a_mean": [df["a"].mean()], "a_median": [df["a"].median()], "a_sum": [df["a"].sum()], + "a_skew": [df["a"].skew()], "b_nunique": [df["b"].n_unique()], + "b_skew": [df["b"].skew()], + "c_nunique": [df["c"].n_unique()], + "c_skew": [df["c"].skew()], "z_min": [df["z"].min()], "z_max": [df["z"].max()], } expected = { - "a_mean": [2], + "a_mean": [2.0], "a_median": [2], "a_sum": [6], + "a_skew": [0.0], "b_nunique": [2], - "z_min": [7], - "z_max": [9], + "b_skew": [0.7071067811865465], + "c_nunique": [3], + "c_skew": [0.0], + "z_min": [7.0], + "z_max": [9.0], + } + assert_equal_data(result, expected) + + +def test_unary_two_elements(constructor: Constructor) -> None: + data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} + 
result = nw.from_native(constructor(data)).select( + a_nunique=nw.col("a").n_unique(), + a_skew=nw.col("a").skew(), + b_nunique=nw.col("b").n_unique(), + b_skew=nw.col("b").skew(), + c_nunique=nw.col("c").n_unique(), + c_skew=nw.col("c").skew(), + ) + expected = { + "a_nunique": [2], + "a_skew": [0.0], + "b_nunique": [2], + "b_skew": [0.0], + "c_nunique": [2], + "c_skew": [float("nan")], + } + assert_equal_data(result, expected) + + +def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: + data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} + df = nw.from_native(constructor_eager(data), eager_only=True) + result = { + "a_nunique": [df["a"].n_unique()], + "a_skew": [df["a"].skew()], + "b_nunique": [df["b"].n_unique()], + "b_skew": [df["b"].skew()], + "c_nunique": [df["c"].n_unique()], + "c_skew": [df["c"].skew()], + } + expected = { + "a_nunique": [2], + "a_skew": [0.0], + "b_nunique": [2], + "b_skew": [0.0], + "c_nunique": [2], + "c_skew": [float("nan")], + } + assert_equal_data(result, expected) + + +def test_unary_one_element(constructor: Constructor) -> None: + data = {"a": [1], "b": [2], "c": [float("nan")]} + # Dask runs into a divide by zero RuntimeWarning for 1 element skew. 
+ context = ( + pytest.warns(RuntimeWarning, match="invalid value encountered in scalar divide") + if "dask" in str(constructor) + else does_not_raise() + ) + with context: + result = nw.from_native(constructor(data)).select( + a_nunique=nw.col("a").n_unique(), + a_skew=nw.col("a").skew(), + b_nunique=nw.col("b").n_unique(), + b_skew=nw.col("b").skew(), + c_nunique=nw.col("c").n_unique(), + c_skew=nw.col("c").skew(), + ) + expected = { + "a_nunique": [1], + "a_skew": [float("nan")], + "b_nunique": [1], + "b_skew": [float("nan")], + "c_nunique": [1], + "c_skew": [float("nan")], + } + assert_equal_data(result, expected) + + +def test_unary_one_element_series(constructor_eager: ConstructorEager) -> None: + data = {"a": [1], "b": [2], "c": [float("nan")]} + df = nw.from_native(constructor_eager(data)) + result = { + "a_nunique": [df["a"].n_unique()], + "a_skew": [df["a"].skew()], + "b_nunique": [df["b"].n_unique()], + "b_skew": [df["b"].skew()], + "c_nunique": [df["c"].n_unique()], + "c_skew": [df["c"].skew()], + } + expected = { + "a_nunique": [1], + "a_skew": [float("nan")], + "b_nunique": [1], + "b_skew": [float("nan")], + "c_nunique": [1], + "c_skew": [float("nan")], } assert_equal_data(result, expected)