Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GroupBy enhancement unifies the return of iterating over GroupBy #42795 #47719

Closed
wants to merge 72 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
54c5068
DOC #45443 edited the documentation of where/mask functions
ahmedibrhm Jul 8, 2022
2951fb1
DOC #45443 edited the documentation of where/mask functions
ahmedibrhm Jul 8, 2022
6335204
Merge branch 'main' into main
ahmedibrhm Jul 8, 2022
8afd6a1
Update generic.py
ahmedibrhm Jul 8, 2022
6a7ede4
Merge branch 'pandas-dev:main' into main
ahmedibrhm Jul 13, 2022
eb0ed28
Merge branch 'pandas-dev:main' into main
ahmedibrhm Jul 14, 2022
83ca209
groupby enahn
ahmedibrhm Jul 14, 2022
a0b3a59
fixing pivot
ahmedibrhm Jul 14, 2022
153bbe5
fixing ops
ahmedibrhm Jul 14, 2022
43d1f92
syntax
ahmedibrhm Jul 14, 2022
242468c
editting test apply
ahmedibrhm Jul 14, 2022
9717f5d
removing testing lines
ahmedibrhm Jul 14, 2022
5d7331e
edit pivot
ahmedibrhm Jul 14, 2022
a6829a7
Merge branch 'main' into issue6
ahmedibrhm Jul 14, 2022
b0abd59
Merge branch 'issue6' of https://github.com/ahmedibrhm/pandas into is…
ahmedibrhm Jul 14, 2022
c709311
edit pivot lib
ahmedibrhm Jul 14, 2022
4e14c87
edit pivot
ahmedibrhm Jul 14, 2022
09dec70
pivot
ahmedibrhm Jul 14, 2022
d1e9525
editting ops
ahmedibrhm Jul 15, 2022
6b5d26b
skipping tests for changing the inputs
ahmedibrhm Jul 15, 2022
b7c797a
adding tests
ahmedibrhm Jul 15, 2022
600cdd9
adding test groupby
ahmedibrhm Jul 15, 2022
1cb253d
editing tests
ahmedibrhm Jul 15, 2022
c0ef8b6
tests
ahmedibrhm Jul 15, 2022
ef05b5b
tests
ahmedibrhm Jul 15, 2022
bd157c0
tests
ahmedibrhm Jul 15, 2022
0e10d13
testing
ahmedibrhm Jul 15, 2022
a4d58c8
Merge branch 'main' into issue6
ahmedibrhm Jul 15, 2022
7efc7ef
pivot editing
ahmedibrhm Jul 15, 2022
fa15243
Merge branch 'issue6' of https://github.com/ahmedibrhm/pandas into is…
ahmedibrhm Jul 15, 2022
dd070b2
test_string
ahmedibrhm Jul 15, 2022
e995f37
base
ahmedibrhm Jul 15, 2022
7951358
base
ahmedibrhm Jul 15, 2022
47b7564
Merge branch 'main' into issue6
ahmedibrhm Jul 15, 2022
fa26f37
test
ahmedibrhm Jul 15, 2022
12cb573
Merge branch 'issue6' of https://github.com/ahmedibrhm/pandas into is…
ahmedibrhm Jul 15, 2022
614a91e
editing array
ahmedibrhm Jul 16, 2022
6c848f2
Merge branch 'main' into issue6
ahmedibrhm Jul 16, 2022
79944f4
Merge branch 'main' into issue6
ahmedibrhm Jul 16, 2022
070c726
editting ops
ahmedibrhm Jul 17, 2022
cc267bb
cond none
ahmedibrhm Jul 17, 2022
7b37e3f
syntax
ahmedibrhm Jul 17, 2022
4f59ca8
comment
ahmedibrhm Jul 17, 2022
698e494
old ex.
ahmedibrhm Jul 17, 2022
330c3c3
editing ops
ahmedibrhm Jul 17, 2022
eb8db3f
editing tests
ahmedibrhm Jul 17, 2022
9c37aa3
indentations
ahmedibrhm Jul 17, 2022
207f44b
Merge branch 'main' into issue6
ahmedibrhm Jul 17, 2022
958edf5
flake 8
ahmedibrhm Jul 17, 2022
73efbab
cond ops
ahmedibrhm Jul 17, 2022
6f52b26
Merge branch 'issue6' of https://github.com/ahmedibrhm/pandas into is…
ahmedibrhm Jul 17, 2022
190eaab
syntax
ahmedibrhm Jul 17, 2022
986a1e9
past ex
ahmedibrhm Jul 17, 2022
cd74c7e
editting input for plot module
ahmedibrhm Jul 18, 2022
6fc7371
unness. import
ahmedibrhm Jul 18, 2022
dafef53
syntax
ahmedibrhm Jul 18, 2022
a3a0dd2
core plot
ahmedibrhm Jul 18, 2022
9fb479f
ops
ahmedibrhm Jul 18, 2022
6464033
editting plotting
ahmedibrhm Jul 18, 2022
d745717
hist and box
ahmedibrhm Jul 18, 2022
b788c0e
box
ahmedibrhm Jul 18, 2022
143466e
hist fix
ahmedibrhm Jul 18, 2022
56f5aa3
boxplot
ahmedibrhm Jul 18, 2022
93889dd
bp
ahmedibrhm Jul 18, 2022
a12a6fb
box and hist
ahmedibrhm Jul 19, 2022
fdea4a8
plotting
ahmedibrhm Jul 19, 2022
871fec8
Merge branch 'main' into issue6
ahmedibrhm Jul 19, 2022
ebf7b92
Update _core.py
ahmedibrhm Jul 19, 2022
6ea317e
Merge branch 'main' into issue6
ahmedibrhm Jul 20, 2022
20c65a7
unnecessary tests
ahmedibrhm Jul 26, 2022
78e1e04
Merge branch 'issue6' of https://github.com/ahmedibrhm/pandas into is…
ahmedibrhm Jul 26, 2022
43a89e9
Merge branch 'main' into issue6
ahmedibrhm Jul 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,8 @@ def __getitem__(self, item: PositionalIndexer):
)
# We are not an array indexer, so maybe e.g. a slice or integer
# indexer. We dispatch to pyarrow.
if type(item) == np.int64:
item = item.item()
value = self._data[item]
if isinstance(value, pa.ChunkedArray):
return type(self)(value)
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,11 @@ def get_grouper(
"""
group_axis = obj._get_axis(axis)

tuple_unified = False
if isinstance(key, list):
if len(key) == 1 and isinstance(key[0], str):
tuple_unified = True

# validate that the passed single level is compatible with the passed
# axis of the object
if level is not None:
Expand Down Expand Up @@ -918,7 +923,12 @@ def is_in_obj(gpr) -> bool:

# create the internals grouper
grouper = ops.BaseGrouper(
group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna
group_axis,
groupings,
tuple_unified=tuple_unified,
sort=sort,
mutated=mutated,
dropna=dropna,
)
return grouper, frozenset(exclusions), obj

Expand Down
15 changes: 9 additions & 6 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,7 @@ def __init__(
self,
axis: Index,
groupings: Sequence[grouper.Grouping],
tuple_unified: bool = False,
sort: bool = True,
group_keys: bool = True,
mutated: bool = False,
Expand All @@ -721,6 +722,7 @@ def __init__(

self.axis = axis
self._groupings: list[grouper.Grouping] = list(groupings)
self.tuple_unified = tuple_unified
self._sort = sort
self.group_keys = group_keys
self.mutated = mutated
Expand Down Expand Up @@ -779,13 +781,13 @@ def _get_grouper(self):
@final
@cache_readonly
def group_keys_seq(self):
    """
    Return the sequence of group keys used when iterating over the groups.

    When there is exactly one grouping and ``tuple_unified`` is not set,
    the keys are the entries of ``self.levels[0]``. In every other case
    (several groupings, or a single grouping with ``tuple_unified`` set)
    the keys are built by ``get_flattened_list`` from ``self.levels`` and
    ``self.codes``.
    """
    # NOTE(review): tuple_unified is set by get_grouper for a one-element
    # list holding a string key; presumably so that such a grouper yields
    # 1-tuples like a multi-key grouper does -- confirm against
    # get_flattened_list.
    if len(self.groupings) == 1 and not self.tuple_unified:
        return self.levels[0]

    ids, _, ngroups = self.group_info

    # provide "flattened" iterator for multi-group setting
    return get_flattened_list(ids, ngroups, self.levels, self.codes)

@final
def apply(
Expand Down Expand Up @@ -1123,12 +1125,13 @@ def __init__(
binlabels,
mutated: bool = False,
indexer=None,
tuple_unified: bool = False,
) -> None:
self.bins = ensure_int64(bins)
self.binlabels = ensure_index(binlabels)
self.mutated = mutated
self.indexer = indexer

self.tuple_unified = False
# These lengths must match, otherwise we could call agg_series
# with empty self.bins, which would raise in libreduction.
assert len(self.binlabels) == len(self.bins)
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ def __internal_pivot_table(
pass
values = list(values)

if isinstance(keys, list):
if len(keys) == 1:
keys = keys[0]
grouped = data.groupby(keys, observed=observed, sort=sort)
agged = grouped.agg(aggfunc)
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
Expand Down Expand Up @@ -367,7 +370,11 @@ def _all_key(key):
margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
cat_axis = 1

for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
for keys, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
if isinstance(keys, tuple):
(key,) = keys
else:
key = keys
all_key = _all_key(key)

# we are going to mutate this, so need to copy!
Expand Down
10 changes: 9 additions & 1 deletion pandas/plotting/_matplotlib/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ def __init__(
# For `hist` plot, need to get grouped original data before `self.data` is
# updated later
if self.by is not None and self._kind == "hist":
self._grouped = data.groupby(self.by)
bymodi = fix_groupby_singlelist_input(by)
self._grouped = data.groupby(bymodi)

self.kind = kind

Expand Down Expand Up @@ -1832,3 +1833,10 @@ def blank_labeler(label, value):
leglabels = labels if labels is not None else idx
for p, l in zip(patches, leglabels):
self._append_legend_handles_labels(p, l)


def fix_groupby_singlelist_input(keys):
    """
    Unwrap a one-element list holding a string key into the bare string.

    Parameters
    ----------
    keys : object
        The ``by`` argument about to be passed to ``groupby``.

    Returns
    -------
    object
        ``keys[0]`` when ``keys`` is a list of length one whose only element
        is a ``str``; otherwise ``keys`` itself, unchanged.

    Notes
    -----
    Grouping by ``["a"]`` makes iteration yield 1-tuple keys, whereas
    grouping by ``"a"`` yields scalar keys. The plotting code iterates over
    the groups, so the single-string list is unwrapped before grouping.
    """
    if isinstance(keys, list) and len(keys) == 1 and isinstance(keys[0], str):
        return keys[0]
    return keys
10 changes: 9 additions & 1 deletion pandas/plotting/_matplotlib/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def reconstruct_data_with_by(
1 3.0 4.0 NaN NaN
2 NaN NaN 5.0 6.0
"""
grouped = data.groupby(by)
bymodi = fix_groupby_singlelist_input(by)
grouped = data.groupby(bymodi)

data_list = []
for key, group in grouped:
Expand All @@ -134,3 +135,10 @@ def reformat_hist_y_given_by(
if by is not None and len(y.shape) > 1:
return np.array([remove_na_arraylike(col) for col in y.T]).T
return remove_na_arraylike(y)


def fix_groupby_singlelist_input(keys):
    """
    Unwrap a one-element list holding a string key into the bare string.

    Parameters
    ----------
    keys : object
        The ``by`` argument about to be passed to ``groupby``.

    Returns
    -------
    object
        ``keys[0]`` when ``keys`` is a list of length one whose only element
        is a ``str``; otherwise ``keys`` itself, unchanged.

    Notes
    -----
    Grouping by ``["a"]`` makes iteration yield 1-tuple keys, whereas
    grouping by ``"a"`` yields scalar keys. The plotting code iterates over
    the groups, so the single-string list is unwrapped before grouping.
    """
    if isinstance(keys, list) and len(keys) == 1 and isinstance(keys[0], str):
        return keys[0]
    return keys
12 changes: 11 additions & 1 deletion pandas/plotting/_matplotlib/hist.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ def _args_adjust(self):
# where subplots are created based on by argument
if is_integer(self.bins):
if self.by is not None:
grouped = self.data.groupby(self.by)[self.columns]
bymodi = fix_groupby_singlelist_input(self.by)
grouped = self.data.groupby(bymodi)[self.columns]
Comment on lines +70 to +71
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because `.hist` and `.box` use groupby internally with a single key. For example, if I did hist by ['a','b','c','d'], the results would be like (a,), (b,), (c,), (d,).
Some plotting functions and the pivot table actually iterate over the groupby.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But here, grouped is only being used in L66 immediately below, right?

self.bins = [self._calculate_bins(group) for key, group in grouped]

In this usage, only the group is being used and the key is ignored. So why is this needed if it's only the key changing?

self.bins = [self._calculate_bins(group) for key, group in grouped]
else:
self.bins = self._calculate_bins(self.data)
Expand Down Expand Up @@ -271,6 +272,8 @@ def _grouped_plot(
grouped = data.groupby(by)
if column is not None:
grouped = grouped[column]
if isinstance(by, list) and len(by) == 1:
by = [by]

naxes = len(grouped)
fig, axes = create_subplots(
Expand Down Expand Up @@ -528,3 +531,10 @@ def hist_frame(
maybe_adjust_figure(fig, wspace=0.3, hspace=0.3)

return axes


def fix_groupby_singlelist_input(keys):
    """
    Unwrap a one-element list holding a string key into the bare string.

    Parameters
    ----------
    keys : object
        The ``by`` argument about to be passed to ``groupby``.

    Returns
    -------
    object
        ``keys[0]`` when ``keys`` is a list of length one whose only element
        is a ``str``; otherwise ``keys`` itself, unchanged.

    Notes
    -----
    Grouping by ``["a"]`` makes iteration yield 1-tuple keys, whereas
    grouping by ``"a"`` yields scalar keys. The plotting code iterates over
    the groups, so the single-string list is unwrapped before grouping.
    """
    if isinstance(keys, list) and len(keys) == 1 and isinstance(keys[0], str):
        return keys[0]
    return keys
18 changes: 18 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2795,3 +2795,21 @@ def test_groupby_none_column_name():
result = df.groupby(by=[None]).sum()
expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None))
tm.assert_frame_equal(result, expected)


def test_groupby_iterator_one_grouper():
    """
    Check the type of the group key yielded when iterating over a GroupBy.

    A list of keys -- including a list with a single key -- must yield tuple
    keys, while a bare string key must yield a scalar key.
    """
    df = DataFrame(columns=["a", "b", "c"], index=["x", "y"])
    df.loc["y"] = Series({"a": 1, "b": 5, "c": 2})

    # several keys given as a list -> tuple key
    key, _ = next(iter(df.groupby(["a", "b"])))
    assert isinstance(key, tuple)

    # a single key given as a list -> still a tuple key
    key, _ = next(iter(df.groupby(["a"])))
    assert isinstance(key, tuple)

    # a single key given as a bare string -> scalar key
    key, _ = next(iter(df.groupby("a")))
    assert isinstance(key, int)
4 changes: 3 additions & 1 deletion pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,9 @@ def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix
# some smoke tests
for c in join_col:
assert result[c].notna().all()

if isinstance(join_col, list):
if len(join_col) == 1:
join_col = join_col[0]
left_grouped = left.groupby(join_col)
right_grouped = right.groupby(join_col)

Expand Down