diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6721523b9b429..711352775400e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1165,7 +1165,7 @@ Groupby/resample/rolling - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupBy.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) - Bug in :meth:`DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) -- Bug in :meth:`.GroupBy.sum` and :meth:`.GroupBy.cumsum` with integer dtypes losing precision (:issue:`37493`) +- Bug in :meth:`.GroupBy.sum`, :meth:`.GroupBy.prod` and :meth:`.GroupBy.cumsum` with integer dtypes losing precision (:issue:`37493`) - Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) - Bug in :meth:`.GroupBy.cumsum` with integer dtypes causing overflows when sum was bigger than maximum of dtype (:issue:`37493`) - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index c8e9df6cd6b38..04db0c9b90bc5 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -63,10 +63,12 @@ def group_sum( is_datetimelike: bool = ..., ) -> None: ... def group_prod( - out: np.ndarray, # floating[:, ::1] + out: np.ndarray, # int64float_t[:, ::1] counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[floating, ndim=2] + values: np.ndarray, # ndarray[int64float_t, ndim=2] labels: np.ndarray, # const intp_t[:] + mask: np.ndarray | None, + result_mask: np.ndarray | None = ..., min_count: int = ..., ) -> None: ... def group_var( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 563abf949dbbc..299dfdf177d91 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -682,10 +682,12 @@ def group_sum( @cython.wraparound(False) @cython.boundscheck(False) def group_prod( - floating[:, ::1] out, + int64float_t[:, ::1] out, int64_t[::1] counts, - ndarray[floating, ndim=2] values, + ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, + const uint8_t[:, ::1] mask, + uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, ) -> None: """ @@ -693,10 +695,11 @@ def group_prod( """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count - floating[:, ::1] prodx + int64float_t val, count + int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -716,15 +719,32 @@ def group_prod( for j in range(K): val = values[i, j] - # not nan - if val == val: + if uses_mask: + isna_entry = mask[i, j] + elif int64float_t is float32_t or int64float_t is float64_t: + isna_entry = not val == val + else: + isna_entry = False + + if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - out[i, j] = NAN + + # else case is not possible + if uses_mask: + result_mask[i, j] = True + # Be deterministic, out was initialized as empty + out[i, j] = 0 + elif int64float_t is float32_t or int64float_t is float64_t: + out[i, j] = NAN + else: + # we only get here when < mincount which gets handled later + pass + else: out[i, j] = prodx[i, j] diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 540825b33c073..418a222a0bfa6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -159,6 +159,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "sum", "ohlc", "cumsum", + "prod", } _cython_arity = {"ohlc": 4} # OHLC @@ -221,13 +222,13 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: values = ensure_float64(values) elif values.dtype.kind in ["i", "u"]: - if how in ["var", "prod", "mean"] or ( + if how in ["var", "mean"] or ( self.kind == "transform" and self.has_dropped_na ): # result may still include NaN, so we have to cast values = ensure_float64(values) - elif how in ["sum", "ohlc", "cumsum"]: + elif how in ["sum", "ohlc", "prod", "cumsum"]: # Avoid overflow during group op if values.dtype.kind == "i": values = ensure_int64(values) @@ -597,8 +598,16 @@ def _call_cython_op( min_count=min_count, is_datetimelike=is_datetimelike, ) - elif self.how == "ohlc": - func(result, counts, values, comp_ids, min_count, mask, result_mask) + elif self.how in ["ohlc", "prod"]: + func( + result, + counts, + values, + comp_ids, + min_count=min_count, + mask=mask, + result_mask=result_mask, + ) else: func(result, counts, values, comp_ids, min_count, **kwargs) else: @@ -631,8 +640,8 @@ def _call_cython_op( # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: - # Neutral value for sum is 0, so don't fill empty groups with nan - cutoff = max(0 if self.how == "sum" else 1, min_count) + # if the op keeps the int dtypes, we have to use 0 + cutoff = max(0 if self.how in ["sum", "prod"] else 1, min_count) empty_groups = counts < cutoff if empty_groups.any(): if result_mask is not None and self.uses_mask(): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1af94434ca1fa..ba39f76203623 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2847,8 +2847,8 @@ def test_single_element_list_grouping(): values, _ = next(iter(df.groupby(["a"]))) -@pytest.mark.parametrize("func", ["sum", "cumsum"]) -def test_groupby_sum_avoid_casting_to_float(func): +@pytest.mark.parametrize("func", ["sum", "cumsum", "prod"]) +def test_groupby_avoid_casting_to_float(func): # GH#37493 val = 922337203685477580 df = DataFrame({"a": 1, "b": [val]}) @@ -2859,12 +2859,13 @@ def test_groupby_sum_avoid_casting_to_float(func): tm.assert_frame_equal(result, expected) -def test_groupby_sum_support_mask(any_numeric_ea_dtype): +@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)]) +def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val): # GH#37493 df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype) - result = df.groupby("a").sum() + result = getattr(df.groupby("a"), func)() expected = DataFrame( - {"b": [3]}, + {"b": [val]}, index=Index([1], name="a", dtype=any_numeric_ea_dtype), dtype=any_numeric_ea_dtype, ) @@ -2887,6 +2888,14 @@ def test_groupby_overflow(val, dtype): expected = DataFrame({"b": [val, val * 2]}, dtype=f"{dtype}64") tm.assert_frame_equal(result, expected) + result = df.groupby("a").prod() + expected = DataFrame( + {"b": [val * val]}, + index=Index([1], name="a", dtype=f"{dtype}64"), + dtype=f"{dtype}64", + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("skipna, val", [(True, 3), (False, pd.NA)]) def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val):