From 2dfbe0ceccda6445f9572856680f77a75a61d468 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 27 Sep 2022 00:51:45 +0200 Subject: [PATCH] Backport PR #48782 on branch 1.5.x (REGR: describe raising when result contains NA) (#48793) Backport PR #48782: REGR: describe raising when result contains NA Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.5.1.rst | 1 + pandas/core/describe.py | 11 ++++++++++- pandas/tests/frame/methods/test_describe.py | 12 ++++++++++++ pandas/tests/series/methods/test_describe.py | 12 ++++++++++-- 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index da0bd746e3da5..6798c7074228c 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -72,6 +72,7 @@ Fixed regressions - Fixed Regression in :meth:`Series.__setitem__` casting ``None`` to ``NaN`` for object dtype (:issue:`48665`) - Fixed Regression in :meth:`DataFrame.loc` when setting values as a :class:`DataFrame` with all ``True`` indexer (:issue:`48701`) - Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using an UTF-8 file handle that was already read from (:issue:`48646`) +- Fixed regression in :meth:`DataFrame.describe` raising ``TypeError`` when result contains ``NA`` (:issue:`48778`) - Fixed regression in :meth:`DataFrame.plot` ignoring invalid ``colormap`` for ``kind="scatter"`` (:issue:`48726`) - Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`) - diff --git a/pandas/core/describe.py b/pandas/core/describe.py index d6546b06ec711..e6f567b123b59 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -24,6 +24,7 @@ from pandas._libs.tslibs import Timestamp from pandas._typing import ( + DtypeObj, NDFrameT, npt, ) @@ -34,10 +35,12 @@ is_bool_dtype, is_complex_dtype, is_datetime64_any_dtype, + is_extension_array_dtype, is_numeric_dtype, is_timedelta64_dtype, ) +import pandas as pd from pandas.core.reshape.concat import concat from pandas.io.formats.format import format_percentiles @@ -242,7 +245,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + [series.max()] ) # GH#48340 - always return float on non-complex numeric data - dtype = float if is_numeric_dtype(series) and not is_complex_dtype(series) else None + dtype: DtypeObj | None + if is_extension_array_dtype(series): + dtype = pd.Float64Dtype() + elif is_numeric_dtype(series) and not is_complex_dtype(series): + dtype = np.dtype("float") + else: + dtype = None return Series(d, index=stat_index, name=series.name, dtype=dtype) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 3a1228ee5c4a5..24d327a101143 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -397,3 +397,15 @@ def test_describe_with_duplicate_columns(self): ser = df.iloc[:, 0].describe() expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1) tm.assert_frame_equal(result, expected) + + def test_ea_with_na(self, any_numeric_ea_dtype): + # GH#48778 + + df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype) + result = df.describe() + expected = DataFrame( + {"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype="Float64", + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index d7650e2768781..a7cedd580b2d0 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,6 +1,9 @@ import numpy as np -from pandas.core.dtypes.common import is_complex_dtype +from pandas.core.dtypes.common import ( + is_complex_dtype, + is_extension_array_dtype, +) from pandas import ( Period, @@ -154,6 +157,11 @@ def test_datetime_is_numeric_includes_datetime(self): def test_numeric_result_dtype(self, any_numeric_dtype): # GH#48340 - describe should always return float on non-complex numeric input + if is_extension_array_dtype(any_numeric_dtype): + dtype = "Float64" + else: + dtype = "complex128" if is_complex_dtype(any_numeric_dtype) else None + ser = Series([0, 1], dtype=any_numeric_dtype) result = ser.describe() expected = Series( @@ -168,6 +176,6 @@ def test_numeric_result_dtype(self, any_numeric_dtype): 1.0, ], index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - dtype="complex128" if is_complex_dtype(ser) else None, + dtype=dtype, ) tm.assert_series_equal(result, expected)