From 966f9be1091b04047101af0e251ed9ebd44cbcf6 Mon Sep 17 00:00:00 2001 From: "D.S. McNeil" Date: Tue, 21 Mar 2017 22:13:29 -0400 Subject: [PATCH] ENH: add .ngroup() method to groupby objects (#14026) Closes #11642 --- doc/source/api.rst | 1 + doc/source/groupby.rst | 34 ++++ doc/source/whatsnew/v0.20.0.txt | 19 +- pandas/core/groupby.py | 56 ++++++ pandas/tests/groupby/test_groupby.py | 249 ++++++++++++++++++++------- 5 files changed, 299 insertions(+), 60 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 33ac5fde651d44..c8b83f77f7030e 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1706,6 +1706,7 @@ Computations / Descriptive Stats GroupBy.mean GroupBy.median GroupBy.min + GroupBy.ngroup GroupBy.nth GroupBy.ohlc GroupBy.prod diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 8484ccd69a983e..73adeccbe79dea 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1087,6 +1087,23 @@ To see the order in which each row appears within its group, use the df.groupby('A').cumcount(ascending=False) # kwarg only +Enumerate groups +~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.20.0 + +To see the ordering of the groups themselves, you can use the ``ngroup`` +method: + +.. ipython:: python + + df = pd.DataFrame(list('aaabba'), columns=['A']) + df + + df.groupby('A').ngroup() + + df.groupby('A').ngroup(ascending=False) # kwarg only + Plotting ~~~~~~~~ @@ -1178,3 +1195,20 @@ column index name will be used as the name of the inserted column: result result.stack() + +Multi-column factorization +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By using ``.ngroup()``, we can extract information about the groups in a +way similar to ``pd.factorize()``, but which applies naturally to multiple +columns of mixed type and different sources: + +.. 
ipython:: python + + df = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")}) + + df + + df.groupby(["A", "B"]).ngroup() + + df.groupby(["A", [0, 0, 0, 1, 1]]).ngroup() diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4e528daa6e876a..b5cffe59eec4f0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -56,8 +56,8 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files. .. _whatsnew_0200.enhancements.groupby_access: -Groupby Enhancements -^^^^^^^^^^^^^^^^^^^^ +Groupby Access Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`) @@ -75,6 +75,21 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere df.groupby(['second', 'A']).sum() +.. _whatsnew_0200.enhancements.groupby_ngroup: + +Groupby Group Numbers +^^^^^^^^^^^^^^^^^^^^^ + +A new groupby method ``ngroup``, parallel to the existing ``cumcount``, has been added to return the group order (:issue:`11642`). + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 1, 2, 3, 3], "B": list("aaaba")}) + + df.groupby("A").ngroup() + + df.groupby(["A", "B"]).ngroup() + .. _whatsnew_0200.enhancements.compressed_urls: Better support for compressed URLs in ``read_csv`` diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 43c57a88b4d194..9f1b6a4429287c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1363,6 +1363,62 @@ def nth(self, n, dropna=None): return result + @Substitution(name='groupby') + @Appender(_doc_template) + def ngroup(self, ascending=True): + """ + Number each group from 0 to the number of groups - 1. + + This is the enumerative complement of cumcount. Note that the + numbers given to the groups match the order in which the groups + would be seen when iterating over the groupby object, not the + order they are first observed. + + .. 
versionadded:: 0.20.0 + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from number of groups - 1 to 0. + + Examples + -------- + + >>> df = pd.DataFrame({"A": list("aaabba")}) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').ngroup() + 0 0 + 1 0 + 2 0 + 3 1 + 4 1 + 5 0 + dtype: int64 + >>> df.groupby('A').ngroup(ascending=False) + 0 1 + 1 1 + 2 1 + 3 0 + 4 0 + 5 1 + dtype: int64 + """ + + self._set_group_selection() + + index = self._selected_obj.index + result = Series(self.grouper.group_info[0], index) + if not ascending: + result = self.ngroups - 1 - result + return result + @Substitution(name='groupby') @Appender(_doc_template) def cumcount(self, ascending=True): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e8469637328836..c8a865e4164364 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3590,60 +3590,6 @@ def test_groupby_with_small_elem(self): res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) tm.assert_frame_equal(res, df.iloc[[2], :]) - def test_cumcount(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3]) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_empty(self): - ge = DataFrame().groupby(level=0) - se = Series().groupby(level=0) - - # edge case, as this is usually considered float - e = Series(dtype='int64') - - assert_series_equal(e, ge.cumcount()) - assert_series_equal(e, se.cumcount()) - - def test_cumcount_dupe_index(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_mi(self): 
- mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=mi) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=mi) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_groupby_not_col(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby([0, 0, 0, 1, 0]) - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - def test_fill_constistency(self): # GH9221 @@ -3906,10 +3852,11 @@ def test_tab_completion(self): 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', - 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) + 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', + 'bfill', 'ffill', 'take', 'tshift', 'pct_change', 'any', + 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', + 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', + 'rolling', 'expanding']) self.assertEqual(results, expected) def test_lower_int_prec_count(self): @@ -4304,6 +4251,192 @@ def test_cummin_cummax(self): tm.assert_series_equal(expected, result) +class TestCounting(tm.TestCase): + + def test_cumcount(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3]) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_empty(self): + ge = DataFrame().groupby(level=0) + se = 
Series().groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.cumcount()) + assert_series_equal(e, se.cumcount()) + + def test_cumcount_dupe_index(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=mi) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=mi) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_groupby_not_col(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_ngroup(self): + df = DataFrame({"A": list("aaaba")}) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0]) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_distinct(self): + df = DataFrame({"A": list("abcde")}) + g = df.groupby('A') + sg = g.A + + expected = Series(range(5), dtype='int64') + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_one_group(self): + df = DataFrame({"A": [0] * 5}) + g = df.groupby('A') + sg = g.A + + expected = Series([0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + # edge 
case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.ngroup()) + assert_series_equal(e, se.ngroup()) + + def test_ngroup_dupe_index(self): + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame({"A": list("aaaba")}, index=mi) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=mi) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_groupby_not_col(self): + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_descending(self): + df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A']) + g = df.groupby(['A']) + + ascending = Series([0, 0, 1, 0, 1]) + descending = Series([1, 1, 0, 1, 0]) + + assert_series_equal(descending, (g.ngroups - 1) - ascending) + assert_series_equal(ascending, g.ngroup(ascending=True)) + assert_series_equal(descending, g.ngroup(ascending=False)) + + def test_ngroup_matches_cumcount(self): + # specific case + df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'], + ['a', 'x'], ['b', 'y']], columns=['A', 'X']) + g = df.groupby(['A', 'X']) + + g_ngroup = g.ngroup() + g_cumcount = g.cumcount() + expected_ngroup = pd.Series([0, 1, 2, 0, 3]) + expected_cumcount = pd.Series([0, 0, 0, 1, 0]) + + assert_series_equal(g_ngroup, expected_ngroup) + assert_series_equal(g_cumcount, expected_cumcount) + + def test_ngroup_cumcount_pair(self): + from itertools import product + + # brute force comparison, inefficient but clear + for p in 
product(range(3), repeat=4): + df = DataFrame({'a': p}) + g = df.groupby(['a']) + + order = sorted(set(p)) + ngroupd = [order.index(val) for val in p] + cumcounted = [p[:i].count(val) for i, val in enumerate(p)] + + assert_series_equal(g.ngroup(), pd.Series(ngroupd)) + assert_series_equal(g.cumcount(), pd.Series(cumcounted)) + + def test_ngroup_respects_groupby_order(self): + np.random.seed(0) + df = DataFrame({'a': np.random.choice(list('abcdef'), 100)}) + for sort_flag in (False, True): + g = df.groupby(['a'], sort=sort_flag) + df['group_id'] = -1 + df['group_index'] = -1 + + for i, (key, group) in enumerate(g): + df.loc[group.index, 'group_id'] = i + for j, ind in enumerate(group.index): + df.loc[ind, 'group_index'] = j + + assert_series_equal(pd.Series(df['group_id'].values), + g.ngroup()) + assert_series_equal(pd.Series(df['group_index'].values), + g.cumcount()) + + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) tups = com._asarray_tuplesafe(tups)