Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: clean up groupby / categorical #21753

Merged
merged 2 commits into from
Jul 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 0 additions & 67 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,73 +698,6 @@ def _set_categories(self, categories, fastpath=False):

self._dtype = new_dtype

def _codes_for_groupby(self, sort, observed):
"""
Code the categories to ensure we can groupby for categoricals.

If observed=True, we return a new Categorical with the observed
categories only.

If sort=False, return a copy of self, coded with categories as
returned by .unique(), followed by any categories not appearing in
the data. If sort=True, return self.

This method is needed solely to ensure the categorical index of the
GroupBy result has categories in the order of appearance in the data
(GH-8868).

Parameters
----------
sort : boolean
The value of the sort parameter groupby was called with.
observed : boolean
Account only for the observed values

Returns
-------
Categorical
If sort=False, the new categories are set to the order of
appearance in codes (unless ordered=True, in which case the
original order is preserved), followed by any unrepresented
categories in the original order.
"""

# we only care about observed values
if observed:
unique_codes = unique1d(self.codes)
cat = self.copy()

take_codes = unique_codes[unique_codes != -1]
if self.ordered:
take_codes = np.sort(take_codes)

# we recode according to the uniques
categories = self.categories.take(take_codes)
codes = _recode_for_categories(self.codes,
self.categories,
categories)

# return a new categorical that maps our new codes
# and categories
dtype = CategoricalDtype(categories, ordered=self.ordered)
return type(self)(codes, dtype=dtype, fastpath=True)

# Already sorted according to self.categories; all is fine
if sort:
return self

# sort=False should order groups in as-encountered order (GH-8868)
cat = self.unique()

# But for groupby to work, all categories should be present,
# including those missing from the data (GH-13179), which .unique()
# above dropped
cat.add_categories(
self.categories[~self.categories.isin(cat.categories)],
inplace=True)

return self.reorder_categories(cat.categories)

def _set_dtype(self, dtype):
"""Internal method for directly updating the CategoricalDtype

Expand Down
99 changes: 99 additions & 0 deletions pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import numpy as np
from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
_recode_for_categories, CategoricalDtype, Categorical)


def recode_for_groupby(c, sort, observed):
    """
    Prepare a Categorical grouper so that groupby can consume its codes.

    With observed=True the result is a new Categorical restricted to the
    categories that actually occur in ``c``.

    With observed=False and sort=True, ``c`` is handed back untouched.

    With observed=False and sort=False, ``c`` is re-coded so that its
    categories follow the order given by ``c.unique()``, followed by the
    categories that never appear in the data. This exists so that the
    categorical index of the GroupBy result lists categories in order of
    appearance (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    Categorical
        The categorical to group on. If sort=False, the new categories
        are set to the order of appearance in codes (unless ordered=True,
        in which case the original order is preserved), followed by any
        unrepresented categories in the original order.
    Categorical or None
        The original ``c`` when observed is true, otherwise None.
    """
    if observed:
        # codes present in the data, without the -1 code
        present = unique1d(c.codes)
        present = present[present != -1]
        if c.ordered:
            # keep the existing category order for ordered categoricals
            present = np.sort(present)

        # re-express the codes against the reduced set of categories
        kept = c.categories.take(present)
        recoded = _recode_for_categories(c.codes, c.categories, kept)

        # wrap the new codes / categories; also return the original
        reduced_dtype = CategoricalDtype(kept, ordered=c.ordered)
        return Categorical(recoded, dtype=reduced_dtype, fastpath=True), c

    if sort:
        # categories are already in the order groupby will use
        return c, None

    # sort=False: groups come out in as-encountered order (GH-8868)
    encountered = c.unique()

    # groupby still needs every category, including those absent from
    # the data (GH-13179), so append the ones .unique() dropped
    absent = c.categories[~c.categories.isin(encountered.categories)]
    encountered = encountered.add_categories(absent)

    return c.reorder_categories(encountered.categories), None


def recode_from_groupby(c, sort, ci):
    """
    Reverse recode_for_groupby on a result index to account for
    sort / observed.

    Parameters
    ----------
    c : Categorical
        The categorical whose categories are restored onto ``ci``.
    sort : boolean
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode

    Returns
    -------
    CategoricalIndex
        When sort is true, ``ci`` with its categories set to those of
        ``c``. Otherwise ``ci`` with the categories of ``c`` it does not
        already have appended after its own.
    """
    full_categories = c.categories

    if not sort:
        # not sorting: keep the index's own order, unobserved go last
        unobserved = full_categories[~full_categories.isin(ci.categories)]
        return ci.add_categories(unobserved)

    # sorting: go back to the original category ordering
    return ci.set_categories(full_categories)
20 changes: 6 additions & 14 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2994,9 +2994,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
# a passed Categorical
elif is_categorical_dtype(self.grouper):

self.all_grouper = self.grouper
self.grouper = self.grouper._codes_for_groupby(
self.sort, observed)
from pandas.core.groupby.categorical import recode_for_groupby
self.grouper, self.all_grouper = recode_for_groupby(
self.grouper, self.sort, observed)
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
Expand Down Expand Up @@ -3073,17 +3073,9 @@ def labels(self):
@cache_readonly
def result_index(self):
if self.all_grouper is not None:
all_categories = self.all_grouper.categories

# we re-order to the original category orderings
if self.sort:
return self.group_index.set_categories(all_categories)

# we are not sorting, so add unobserved to the end
categories = self.group_index.categories
return self.group_index.add_categories(
all_categories[~all_categories.isin(categories)])

from pandas.core.groupby.categorical import recode_from_groupby
return recode_from_groupby(self.all_grouper,
self.sort, self.group_index)
return self.group_index

@property
Expand Down