From 01d18209be9c2ba8f23e52a041beab2d0fb36851 Mon Sep 17 00:00:00 2001
From: ruiann <534676033@qq.com>
Date: Sun, 1 Oct 2017 12:48:56 -0500
Subject: [PATCH] BUG: TimeGrouper bug fix when applied for list groupers
 (#17587)

closes #17530
---
 doc/source/whatsnew/v0.21.0.txt          |   1 +
 pandas/core/groupby.py                   | 119 ++++++++++++++++++-----
 pandas/core/resample.py                  |  27 +----
 pandas/tests/groupby/test_timegrouper.py |  19 ++++
 4 files changed, 116 insertions(+), 50 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 1094e96bd0d201..3276310fa3e6e2 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -704,6 +704,7 @@ Groupby/Resample/Rolling
 - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
 - Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
 - Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
+- Bug in ``TimeGrouper`` where the result differs when it is passed as a list and as a scalar (:issue:`17530`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 2f2056279558d3..9379ade4be7a69 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
     def ax(self):
         return self.grouper
 
-    def _get_grouper(self, obj):
+    def _get_grouper(self, obj, validate=True):
         """
         Parameters
         ----------
         obj : the subject object
+        validate : boolean, default True
+            if True, validate the grouper
 
         Returns
         -------
@@ -271,7 +273,8 @@ def _get_grouper(self, obj):
         self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
                                                           axis=self.axis,
                                                           level=self.level,
-                                                          sort=self.sort)
+                                                          sort=self.sort,
+                                                          validate=validate)
         return self.binner, self.grouper, self.obj
 
     def _set_grouper(self, obj, sort=False):
@@ -326,12 +329,6 @@ def _set_grouper(self, obj, sort=False):
             self.grouper = ax
         return self.grouper
 
-    def _get_binner_for_grouping(self, obj):
-        """ default to the standard binner here """
-        group_axis = obj._get_axis(self.axis)
-        return Grouping(group_axis, None, obj=obj, name=self.key,
-                        level=self.level, sort=self.sort, in_axis=False)
-
     @property
     def groups(self):
         return self.grouper.groups
@@ -1733,16 +1730,34 @@ class BaseGrouper(object):
     """
    This is an internal Grouper class, which actually holds
    the generated groups
+
+    Parameters
+    ----------
+    axis : int
+        the axis to group
+    groupings : array of grouping
+        all the grouping instances to handle in this grouper;
+        e.g. for a groupby with a list of groupers, pass the whole list
+    sort : boolean, default True
+        whether this grouper will give sorted results or not
+    group_keys : boolean, default True
+    mutated : boolean, default False
+    indexer : intp array, optional
+        the indexer created by Grouper
+        some groupers (e.g. TimeGrouper) sort their axis, and their
+        group_info is sorted as well, so the indexer is needed to reorder
+
     """
 
     def __init__(self, axis, groupings, sort=True, group_keys=True,
-                 mutated=False):
+                 mutated=False, indexer=None):
         self._filter_empty_groups = self.compressed = len(groupings) != 1
         self.axis = axis
         self.groupings = groupings
         self.sort = sort
         self.group_keys = group_keys
         self.mutated = mutated
+        self.indexer = indexer
 
     @property
     def shape(self):
@@ -1888,6 +1903,15 @@ def group_info(self):
         comp_ids = _ensure_int64(comp_ids)
         return comp_ids, obs_group_ids, ngroups
 
+    @cache_readonly
+    def label_info(self):
+        # return the labels of the items in the original grouped axis
+        labels, _, _ = self.group_info
+        if self.indexer is not None:
+            sorter = np.lexsort((labels, self.indexer))
+            labels = labels[sorter]
+        return labels
+
     def _get_compressed_labels(self):
         all_labels = [ping.labels for ping in self.groupings]
         if len(all_labels) > 1:
@@ -2288,11 +2312,42 @@ def generate_bins_generic(values, binner, closed):
 
 class BinGrouper(BaseGrouper):
 
-    def __init__(self, bins, binlabels, filter_empty=False, mutated=False):
+    """
+    This is an internal Grouper class
+
+    Parameters
+    ----------
+    bins : the split index of binlabels used to group the items of the axis
+    binlabels : the label list
+    filter_empty : boolean, default False
+    mutated : boolean, default False
+    indexer : intp array, optional
+
+    Examples
+    --------
+    bins: [2, 4, 6, 8, 10]
+    binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
+                              '2005-01-05', '2005-01-07', '2005-01-09'],
+                              dtype='datetime64[ns]', freq='2D')
+
+    the group_info, which contains the label of each item in the grouped
+    axis, the indices of the labels, and the number of groups, is
+
+    (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
+
+    meaning that the grouped axis has 10 items which fall into 5 labels;
+    the first and second items belong to the first label, the third and
+    fourth items belong to the second label, and so on
+
+    """
+
+    def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
+                 indexer=None):
         self.bins = _ensure_int64(bins)
         self.binlabels = _ensure_index(binlabels)
         self._filter_empty_groups = filter_empty
         self.mutated = mutated
+        self.indexer = indexer
 
     @cache_readonly
     def groups(self):
@@ -2460,6 +2515,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             self.grouper, self._labels, self._group_index = \
                 index._get_grouper_for_level(self.grouper, level)
 
+        # a passed Grouper-like object: get the grouper in the same way as a
+        # single-grouper groupby, and use its group_info to get the labels
+        elif isinstance(self.grouper, Grouper):
+            # get the new grouper; we already have disambiguated
+            # what key/level refer to exactly, don't need to
+            # check again as we have by this point converted these
+            # to an actual value (rather than a pd.Grouper)
+            _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
+            if self.name is None:
+                self.name = grouper.result_index.name
+            self.obj = self.grouper.obj
+            self.grouper = grouper
+
         else:
             if self.grouper is None and self.name is not None:
                 self.grouper = self.obj[self.name]
@@ -2482,16 +2550,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                              categories=c,
                              ordered=self.grouper.ordered))
 
-        # a passed Grouper like
-        elif isinstance(self.grouper, Grouper):
-
-            # get the new grouper
-            grouper = self.grouper._get_binner_for_grouping(self.obj)
-            self.obj = self.grouper.obj
-            self.grouper = grouper
-            if self.name is None:
-                self.name = grouper.name
-
         # we are done
         if isinstance(self.grouper, Grouping):
             self.grouper = self.grouper.grouper
@@ -2536,6 +2594,10 @@ def ngroups(self):
 
     @cache_readonly
     def indices(self):
+        # we have a list of groupers
+        if isinstance(self.grouper, BaseGrouper):
+            return self.grouper.indices
+
         values = _ensure_categorical(self.grouper)
         return values._reverse_indexer()
 
@@ -2553,9 +2615,14 @@ def group_index(self):
 
     def _make_labels(self):
         if self._labels is None or self._group_index is None:
-            labels, uniques = algorithms.factorize(
-                self.grouper, sort=self.sort)
-            uniques = Index(uniques,
name=self.name) + # we have a list of groupers + if isinstance(self.grouper, BaseGrouper): + labels = self.grouper.label_info + uniques = self.grouper.result_index + else: + labels, uniques = algorithms.factorize( + self.grouper, sort=self.sort) + uniques = Index(uniques, name=self.name) self._labels = labels self._group_index = uniques @@ -2566,7 +2633,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - mutated=False): + mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -2583,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, are and then creates a Grouping for each one, combined into a BaseGrouper. + If validate, then check for key/level overlaps + """ group_axis = obj._get_axis(axis) @@ -2707,7 +2776,7 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: - if gpr in obj.index.names: + if validate and gpr in obj.index.names: warnings.warn( ("'%s' is both a column name and an index level.\n" "Defaulting to column but " diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 083fbcaaabe460..6edbb99641542d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -250,7 +250,7 @@ def _get_binner(self): """ binner, bins, binlabels = self._get_binner_for_time() - bin_grouper = BinGrouper(bins, binlabels) + bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) return binner, bin_grouper def _assure_grouper(self): @@ -1105,35 +1105,12 @@ def _get_resampler(self, obj, kind=None): "TimedeltaIndex or PeriodIndex, " "but got an instance of %r" % type(ax).__name__) - def _get_grouper(self, obj): + def _get_grouper(self, obj, validate=True): # create the resampler and return our binner r = self._get_resampler(obj) r._set_binner() return r.binner, r.grouper, r.obj - def _get_binner_for_grouping(self, obj): - # return an ordering of the transformed group labels, - # suitable for multi-grouping, e.g the labels for - # the resampled intervals - binner, grouper, obj = self._get_grouper(obj) - - l = [] - for key, group in grouper.get_iterator(self.ax): - l.extend([key] * len(group)) - - if isinstance(self.ax, PeriodIndex): - grouper = binner.__class__(l, freq=binner.freq, name=binner.name) - else: - # resampling causes duplicated values, specifying freq is invalid - grouper = binner.__class__(l, name=binner.name) - - # since we may have had to sort - # may need to reorder groups here - if self.indexer is not None: - indexer = self.indexer.argsort(kind='quicksort') - grouper = grouper.take(indexer) - return grouper - def _get_time_bins(self, ax): if not isinstance(ax, DatetimeIndex): raise TypeError('axis must be a DatetimeIndex, but got ' diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index fafcbf947e3df7..c8503b16a0e16a 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -623,3 +623,22 @@ def test_nunique_with_timegrouper_and_nat(self): result = test.groupby(grouper)['data'].nunique() expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() tm.assert_series_equal(result, expected) + + def test_scalar_call_versus_list_call(self): + # Issue: 17530 + data_frame = { + 'location': ['shanghai', 'beijing', 'shanghai'], + 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', + '2017-08-11 22:23:15'], + dtype='datetime64[ns]'), + 'value': [1, 2, 3] + } + data_frame = 
pd.DataFrame(data_frame).set_index('time') + grouper = pd.Grouper(freq='D') + + grouped = data_frame.groupby(grouper) + result = grouped.count() + grouped = data_frame.groupby([grouper]) + expected = grouped.count() + + assert_frame_equal(result, expected)
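
Note: as a quick illustration of the behaviour the new test pins down, here is a minimal standalone sketch (assuming a pandas build with this patch applied; the frame mirrors the test data above, and the names `df`, `grouper`, and the final mixed Grouper-plus-column grouping are purely illustrative, not part of the patch):

    import pandas as pd

    # Frame indexed by time, mirroring the data used in the new test.
    df = pd.DataFrame({
        'location': ['shanghai', 'beijing', 'shanghai'],
        'time': pd.to_datetime(['2017-08-09 13:32:23',
                                '2017-08-11 23:23:15',
                                '2017-08-11 22:23:15']),
        'value': [1, 2, 3],
    }).set_index('time')

    grouper = pd.Grouper(freq='D')

    # With the fix, the scalar and the one-element-list forms agree.
    scalar_result = df.groupby(grouper).count()
    list_result = df.groupby([grouper]).count()
    assert scalar_result.equals(list_result)

    # The list form also composes with an ordinary column key.
    print(df.groupby([grouper, 'location'])['value'].sum())

Before the change, the list form went through `Grouper._get_binner_for_grouping` and could disagree with (or fail where) the scalar form succeeded; after it, both paths build the same `BaseGrouper`/`BinGrouper` machinery.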