From 985013b6efbf1b877d337731aef467a3b6fe4854 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 29 May 2018 07:02:31 -0400 Subject: [PATCH 1/2] BUG: dropna incorrect with categoricals in pivot_table closes #21133 --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/reshape/pivot.py | 20 ++++++++++++++++++-- pandas/tests/reshape/test_pivot.py | 26 +++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 974527624a312..e679c0d305827 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -111,6 +111,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) +- Bug in :func:`pivot_table` with ``dropna=True``, an ordered ``Categorical`` for the index pivots and missing values in the ``values`` would give a mis-ordered result (:issue:`21133`) - Other diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e02420323704e..9a2ad5d13d77a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,8 +1,10 @@ # pylint: disable=E1103 -from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.dtypes.common import ( + is_list_like, is_scalar, is_integer_dtype) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -79,8 +81,22 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys, observed=dropna) + # group by the cartesian product of the grouper + # if we have a categorical + grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): + agged = agged.dropna(how='all') + + # gh-21133 + # we want to down cast if + # the original values are ints + # as we grouped with a NaN value + # and then dropped, coercing to floats + for v in [v for v in values if v in data and v in agged]: + if (is_integer_dtype(data[v]) and + not is_integer_dtype(agged[v])): + agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d2cf3fc11e165..3ec60d50f2792 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import datetime, date, timedelta @@ -16,6 +17,11 @@ from pandas.api.types import CategoricalDtype as CDT +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + class TestPivotTable(object): def setup_method(self, method): @@ -109,7 +115,6 @@ def test_pivot_table_categorical(self): index=exp_index) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('dropna', [True, False]) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ['a', 'b', 'c', 'd'] @@ -137,6 +142,25 @@ def test_pivot_table_dropna_categoricals(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_with_non_observable_dropna(self, dropna): + # gh-21133 + df = pd.DataFrame( + {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'], + categories=['low', 'high'], + ordered=True), + 'B': range(5)}) + + result = df.pivot_table(index='A', values='B', dropna=dropna) + expected = pd.DataFrame( + {'B': [2, 3]}, + index=pd.Index( + pd.Categorical.from_codes([0, 1], + categories=['low', 'high'], + ordered=True), + name='A')) + + tm.assert_frame_equal(result, expected) + def test_pass_array(self): result = self.data.pivot_table( 'D', index=self.data.A, columns=self.data.C) From 683fd9e578bdfcd03d222551429cbf0fdaa8d76f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jun 2018 00:04:10 +0200 Subject: [PATCH 2/2] correct whatsnew message --- doc/source/whatsnew/v0.23.1.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 9deb7603d5964..97a5975dad9a6 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -29,7 +29,8 @@ Fixed Regressions - Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) -- Regression in :func:`pivot_table` where an ordered ``Categorical`` for the ``index`` and missing values in the ``values`` would give a mis-ordered result (:issue:`21133`) +- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing + values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) .. _whatsnew_0231.performance: