BUG: Groupby selection context not being properly reset #28541

Closed
wants to merge 9 commits
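The user-visible symptom, as a minimal sketch reconstructed from GH28523 and the regression test added below (values are illustrative):

import pandas as pd

df = pd.DataFrame({"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]})
grouped = df.groupby("A")

before = grouped.apply(lambda x: x)  # includes the grouping column "A"
grouped.nth(0)                       # on affected versions this leaks the group selection
after = grouped.apply(lambda x: x)   # may now silently drop "A"

# With this fix, `before` and `after` are identical frames.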
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -376,6 +376,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`)
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
- Bug in :meth:`DataFrameGroupby` causing unexpected mutations of the groupby object (:issue:`28523`)
Contributor

I think the ref needs to be :class:`pandas.core.groupby.DataFrameGroupBy`. And maybe briefly explain the user-visible behavior: "Multiple operations on a DataFrameGroupBy object not giving the same results in certain cases".

- Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)

Reshaping
197 changes: 99 additions & 98 deletions pandas/core/groupby/groupby.py
@@ -594,63 +594,63 @@ def pipe(self, func, *args, **kwargs):
def _make_wrapper(self, name):
assert name in self._apply_whitelist

self._set_group_selection()

# need to setup the selection
# as are not passed directly but in the grouper
f = getattr(self._selected_obj, name)
if not isinstance(f, types.MethodType):
return self.apply(lambda self: getattr(self, name))

f = getattr(type(self._selected_obj), name)
sig = inspect.signature(f)

def wrapper(*args, **kwargs):
# a little trickery for aggregation functions that need an axis
# argument
if "axis" in sig.parameters:
if kwargs.get("axis", None) is None:
kwargs["axis"] = self.axis

def curried(x):
return f(x, *args, **kwargs)
with _group_selection_context(self):
# need to setup the selection
# as are not passed directly but in the grouper
f = getattr(self._selected_obj, name)
if not isinstance(f, types.MethodType):
return self.apply(lambda self: getattr(self, name))

f = getattr(type(self._selected_obj), name)
sig = inspect.signature(f)

def wrapper(*args, **kwargs):
# a little trickery for aggregation functions that need an axis
# argument
if "axis" in sig.parameters:
if kwargs.get("axis", None) is None:
kwargs["axis"] = self.axis

def curried(x):
return f(x, *args, **kwargs)

# preserve the name so we can detect it when calling plot methods,
# to avoid duplicates
curried.__name__ = name

# special case otherwise extra plots are created when catching the
# exception below
if name in base.plotting_methods:
return self.apply(curried)

# preserve the name so we can detect it when calling plot methods,
# to avoid duplicates
curried.__name__ = name
try:
return self.apply(curried)
except TypeError as err:
if not re.search(
"reduction operation '.*' not allowed for this dtype", str(err)
):
# We don't have a cython implementation
# TODO: is the above comment accurate?
raise

# special case otherwise extra plots are created when catching the
# exception below
if name in base.plotting_methods:
return self.apply(curried)
# related to : GH3688
# try item-by-item
# this can be called recursively, so need to raise
# ValueError
# if we don't have this method to indicated to aggregate to
# mark this column as an error
try:
return self._aggregate_item_by_item(name, *args, **kwargs)
except AttributeError:
# e.g. SparseArray has no flags attr
# FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item'
# occurs in idxmax() case
# in tests.groupby.test_function.test_non_cython_api
raise ValueError

try:
return self.apply(curried)
except TypeError as err:
if not re.search(
"reduction operation '.*' not allowed for this dtype", str(err)
):
# We don't have a cython implementation
# TODO: is the above comment accurate?
raise

# related to : GH3688
# try item-by-item
# this can be called recursively, so need to raise
# ValueError
# if we don't have this method to indicated to aggregate to
# mark this column as an error
try:
return self._aggregate_item_by_item(name, *args, **kwargs)
except AttributeError:
# e.g. SparseArray has no flags attr
# FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item'
# occurs in idxmax() case
# in tests.groupby.test_function.test_non_cython_api
raise ValueError
wrapper.__name__ = name

wrapper.__name__ = name
return wrapper
return wrapper
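The core of the change in _make_wrapper: the one-shot self._set_group_selection() call is replaced by the _group_selection_context context manager, so the selection is always restored when the wrapped call finishes. A minimal sketch of that pattern (illustrative only; the real helper lives in pandas/core/groupby/groupby.py):

from contextlib import contextmanager

@contextmanager
def _group_selection_context(groupby):
    # set the group selection for the duration of the block and always
    # restore it afterwards, so an early return or exception can no
    # longer leave the GroupBy object mutated
    groupby._set_group_selection()
    try:
        yield groupby
    finally:
        groupby._reset_group_selection()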

def get_group(self, name, obj=None):
"""
@@ -721,7 +721,9 @@ def f(g):
f = func

# ignore SettingWithCopy here in case the user mutates
with option_context("mode.chained_assignment", None):
with option_context(
"mode.chained_assignment", None
) as _, _group_selection_context(self) as _:
try:
result = self._python_apply_general(f)
except TypeError:
@@ -732,9 +734,7 @@ def f(g):
# except if the udf is trying an operation that
# fails on *some* columns, e.g. a numeric operation
# on a string grouper column

with _group_selection_context(self):
return self._python_apply_general(f)
return self._python_apply_general(f)

return result
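For reference, stacking two context managers in a single with statement, as done above, is equivalent to nesting them: the chained-assignment option and the group selection are unwound in reverse order when the block exits. A standalone sketch (made-up names):

from contextlib import contextmanager

@contextmanager
def outer():
    print("enter outer")
    yield
    print("exit outer")

@contextmanager
def inner():
    print("enter inner")
    yield
    print("exit inner")

with outer() as _, inner() as _:  # same as: with outer(): with inner(): ...
    print("body")
# prints: enter outer, enter inner, body, exit inner, exit outer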

@@ -1335,35 +1335,34 @@ def f(self, **kwargs):
if "min_count" not in kwargs:
kwargs["min_count"] = min_count

self._set_group_selection()

# try a cython aggregation if we can
try:
return self._cython_agg_general(alias, alt=npfunc, **kwargs)
except DataError:
pass
except NotImplementedError as err:
if "function is not implemented for this dtype" in str(err):
# raised in _get_cython_function, in some cases can
# be trimmed by implementing cython funcs for more dtypes
pass
elif "decimal does not support skipna=True" in str(err):
# FIXME: kludge for test_decimal:test_in_numeric_groupby
with _group_selection_context(self):
# try a cython aggregation if we can
try:
return self._cython_agg_general(alias, alt=npfunc, **kwargs)
except DataError:
pass
except NotImplementedError as err:
if "function is not implemented for this dtype" in str(err):
# raised in _get_cython_function, in some cases can
# be trimmed by implementing cython funcs for more dtypes
pass
elif "decimal does not support skipna=True" in str(err):
# FIXME: kludge for test_decimal:test_in_numeric_groupby
pass
else:
raise

# apply a non-cython aggregation
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))

# coerce the resulting columns if we can
if isinstance(result, DataFrame):
for col in result.columns:
result[col] = self._try_cast(result[col], self.obj[col])
else:
raise

# apply a non-cython aggregation
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))

# coerce the resulting columns if we can
if isinstance(result, DataFrame):
for col in result.columns:
result[col] = self._try_cast(result[col], self.obj[col])
else:
result = self._try_cast(result, self.obj)
result = self._try_cast(result, self.obj)

return result
return result

set_function_name(f, name, cls)

@@ -1736,28 +1735,30 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
nth_values = list(set(n))

nth_array = np.array(nth_values, dtype=np.intp)
self._set_group_selection()

mask_left = np.in1d(self._cumcount_array(), nth_array)
mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array)
mask = mask_left | mask_right
with _group_selection_context(self):
mask_left = np.in1d(self._cumcount_array(), nth_array)
mask_right = np.in1d(
self._cumcount_array(ascending=False) + 1, -nth_array
)
mask = mask_left | mask_right

ids, _, _ = self.grouper.group_info
ids, _, _ = self.grouper.group_info

# Drop NA values in grouping
mask = mask & (ids != -1)
# Drop NA values in grouping
mask = mask & (ids != -1)

out = self._selected_obj[mask]
if not self.as_index:
return out
out = self._selected_obj[mask]
if not self.as_index:
return out

result_index = self.grouper.result_index
out.index = result_index[ids[mask]]
result_index = self.grouper.result_index
out.index = result_index[ids[mask]]

if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)
if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)

return out.sort_index() if self.sort else out
return out.sort_index() if self.sort else out

# dropna is truthy
if isinstance(n, valid_containers):
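How the nth mask above selects rows, as a standalone sketch (the arrays are made up for illustration): non-negative n are matched against the forward cumcount, negative n against the backward cumcount.

import numpy as np

cumcount_fwd = np.array([0, 1, 0, 1])         # position of each row within its group
cumcount_bwd = np.array([1, 0, 1, 0])         # position counted from the group's end
nth_array = np.array([0, -1], dtype=np.intp)  # e.g. .nth([0, -1])

mask_left = np.in1d(cumcount_fwd, nth_array)        # matches n >= 0
mask_right = np.in1d(cumcount_bwd + 1, -nth_array)  # matches n < 0
mask = mask_left | mask_right                       # rows kept by nth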
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_other.py
@@ -482,13 +482,13 @@ def test_agg_timezone_round_trip():
assert ts == grouped.first()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]
assert ts == grouped.apply(lambda x: x.iloc[0]).drop("A", 1).iloc[0, 0]

ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
assert ts == grouped.apply(lambda x: x.iloc[-1]).drop("A", 1).iloc[0, 0]
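Why the added .drop("A", 1): with the selection context properly reset, apply now sees the full frame including the grouping column, so the result gains an extra "A" column that the test drops before the scalar comparison. A hedged illustration (data made up):

import pandas as pd

df = pd.DataFrame({"A": ["x", "x", "y"], "B": pd.to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"])})
grouped = df.groupby("A")

res = grouped.apply(lambda x: x.iloc[0])
# after this change res keeps the grouping column alongside "B", so
# res.columns is ['A', 'B'] and the test compares res.drop("A", 1).iloc[0, 0]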


def test_sum_uint64_overflow():
17 changes: 17 additions & 0 deletions pandas/tests/groupby/test_groupby.py
@@ -433,6 +433,23 @@ def test_frame_groupby_columns(tsframe):
assert len(v.columns) == 2


def test_frame_groupby_avoids_mutate(reduction_func):
# GH28523
func = reduction_func
df = pd.DataFrame({"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]})
grouped = df.groupby("A")

expected = grouped.apply(lambda x: x)

args = {"nth": [0], "quantile": [0.5]}.get(func, [])
fn = getattr(grouped, func)
fn(*args)

result = grouped.apply(lambda x: x)

tm.assert_frame_equal(expected, result)


def test_frame_set_name_single(df):
grouped = df.groupby("A")

2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_grouping.py
@@ -172,7 +172,7 @@ def test_grouper_creation_bug(self):
result = g.sum()
assert_frame_equal(result, expected)

result = g.apply(lambda x: x.sum())
result = g.apply(lambda x: x.sum()).drop("A", 1)
assert_frame_equal(result, expected)

g = df.groupby(pd.Grouper(key="A", axis=0))