From 966f9be1091b04047101af0e251ed9ebd44cbcf6 Mon Sep 17 00:00:00 2001
From: "D.S. McNeil" <dsm054@gmail.com>
Date: Tue, 21 Mar 2017 22:13:29 -0400
Subject: [PATCH] ENH: add .ngroup() method to groupby objects (#14026)

Closes #11642
---
 doc/source/api.rst                   |   1 +
 doc/source/groupby.rst               |  34 ++++
 doc/source/whatsnew/v0.20.0.txt      |  19 +-
 pandas/core/groupby.py               |  56 ++++++
 pandas/tests/groupby/test_groupby.py | 249 ++++++++++++++++++++-------
 5 files changed, 299 insertions(+), 60 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index 33ac5fde651d44..c8b83f77f7030e 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -1706,6 +1706,7 @@ Computations / Descriptive Stats
    GroupBy.mean
    GroupBy.median
    GroupBy.min
+   GroupBy.ngroup
    GroupBy.nth
    GroupBy.ohlc
    GroupBy.prod
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 8484ccd69a983e..73adeccbe79dea 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -1087,6 +1087,23 @@ To see the order in which each row appears within its group, use the
 
    df.groupby('A').cumcount(ascending=False)  # kwarg only
 
+Enumerate groups
+~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+To see the ordering of the groups themselves, you can use the ``ngroup``
+method:
+
+.. ipython:: python
+
+   df = pd.DataFrame(list('aaabba'), columns=['A'])
+   df
+
+   df.groupby('A').ngroup()
+
+   df.groupby('A').ngroup(ascending=False)  # kwarg only
+
 Plotting
 ~~~~~~~~
 
@@ -1178,3 +1195,20 @@ column index name will be used as the name of the inserted column:
    result
 
    result.stack()
+
+Multi-column factorization
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By using ``.ngroup()``, we can extract information about the groups in a
+way similar to ``pd.factorize()``, but which applies naturally to multiple
+columns of mixed type and different sources:
+
+.. ipython::python
+
+    df = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})
+
+    df
+
+    df.groupby(["A", "B"]).ngroup()
+
+    df.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 4e528daa6e876a..b5cffe59eec4f0 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -56,8 +56,8 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files.
 
 .. _whatsnew_0200.enhancements.groupby_access:
 
-Groupby Enhancements
-^^^^^^^^^^^^^^^^^^^^
+Groupby Access Enhancements
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`)
 
@@ -75,6 +75,21 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
 
    df.groupby(['second', 'A']).sum()
 
+.. _whatsnew_0200.enhancements.groupby_ngroup:
+
+Groupby Group Numbers
+^^^^^^^^^^^^^^^^^^^^^
+
+A new groupby method ``ngroup``, parallel to the existing ``cumcount``, has been added to return the group order (:issue:`11642`).
+
+.. ipython:: python
+
+   df = pd.DataFrame({"A": [1, 1, 2, 3, 3], "B": list("aaaba")})
+
+   df.groupby("A").ngroup()
+
+   df.groupby(["A", "B"]).ngroup()
+
 .. _whatsnew_0200.enhancements.compressed_urls:
 
 Better support for compressed URLs in ``read_csv``
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 43c57a88b4d194..9f1b6a4429287c 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1363,6 +1363,62 @@ def nth(self, n, dropna=None):
 
         return result
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def ngroup(self, ascending=True):
+        """
+        Number each group from 0 to the number of groups - 1.
+
+        This is the enumerative complement of cumcount.  Note that the
+        numbers given to the groups match the order in which the groups
+        would be seen when iterating over the groupby object, not the
+        order they are first observed.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            If False, number in reverse, from number of group - 1 to 0.
+
+        Examples
+        --------
+
+        >>> df = pd.DataFrame({"A": list("aaabba")})
+        >>> df
+           A
+        0  a
+        1  a
+        2  a
+        3  b
+        4  b
+        5  a
+        >>> df.groupby('A').ngroup()
+        0    0
+        1    0
+        2    0
+        3    1
+        4    1
+        5    0
+        dtype: int64
+        >>> df.groupby('A').ngroup(ascending=False)
+        0    1
+        1    1
+        2    1
+        3    0
+        4    0
+        5    1
+        dtype: int64
+        """
+
+        self._set_group_selection()
+
+        index = self._selected_obj.index
+        result = Series(self.grouper.group_info[0], index)
+        if not ascending:
+            result = self.ngroups - 1 - result
+        return result
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def cumcount(self, ascending=True):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e8469637328836..c8a865e4164364 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3590,60 +3590,6 @@ def test_groupby_with_small_elem(self):
         res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start'))
         tm.assert_frame_equal(res, df.iloc[[2], :])
 
-    def test_cumcount(self):
-        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
-        g = df.groupby('A')
-        sg = g.A
-
-        expected = Series([0, 1, 2, 0, 3])
-
-        assert_series_equal(expected, g.cumcount())
-        assert_series_equal(expected, sg.cumcount())
-
-    def test_cumcount_empty(self):
-        ge = DataFrame().groupby(level=0)
-        se = Series().groupby(level=0)
-
-        # edge case, as this is usually considered float
-        e = Series(dtype='int64')
-
-        assert_series_equal(e, ge.cumcount())
-        assert_series_equal(e, se.cumcount())
-
-    def test_cumcount_dupe_index(self):
-        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
-                       index=[0] * 5)
-        g = df.groupby('A')
-        sg = g.A
-
-        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
-
-        assert_series_equal(expected, g.cumcount())
-        assert_series_equal(expected, sg.cumcount())
-
-    def test_cumcount_mi(self):
-        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
-        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
-                       index=mi)
-        g = df.groupby('A')
-        sg = g.A
-
-        expected = Series([0, 1, 2, 0, 3], index=mi)
-
-        assert_series_equal(expected, g.cumcount())
-        assert_series_equal(expected, sg.cumcount())
-
-    def test_cumcount_groupby_not_col(self):
-        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
-                       index=[0] * 5)
-        g = df.groupby([0, 0, 0, 1, 0])
-        sg = g.A
-
-        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
-
-        assert_series_equal(expected, g.cumcount())
-        assert_series_equal(expected, sg.cumcount())
-
     def test_fill_constistency(self):
 
         # GH9221
@@ -3906,10 +3852,11 @@ def test_tab_completion(self):
              'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
              'nunique', 'head', 'describe', 'cummax', 'quantile',
              'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
-             'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill',
-             'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
-             'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
-             'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
+             'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew',
+             'bfill', 'ffill', 'take', 'tshift', 'pct_change', 'any',
+             'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff',
+             'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill',
+             'rolling', 'expanding'])
         self.assertEqual(results, expected)
 
     def test_lower_int_prec_count(self):
@@ -4304,6 +4251,192 @@ def test_cummin_cummax(self):
             tm.assert_series_equal(expected, result)
 
 
+class TestCounting(tm.TestCase):
+
+    def test_cumcount(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3])
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_empty(self):
+        ge = DataFrame().groupby(level=0)
+        se = Series().groupby(level=0)
+
+        # edge case, as this is usually considered float
+        e = Series(dtype='int64')
+
+        assert_series_equal(e, ge.cumcount())
+        assert_series_equal(e, se.cumcount())
+
+    def test_cumcount_dupe_index(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+                       index=[0] * 5)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_mi(self):
+        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+                       index=mi)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=mi)
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_groupby_not_col(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+                       index=[0] * 5)
+        g = df.groupby([0, 0, 0, 1, 0])
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())
+
+    def test_ngroup(self):
+        df = DataFrame({"A": list("aaaba")})
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0])
+
+        assert_series_equal(expected, g.ngroup())
+        assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_distinct(self):
+        df = DataFrame({"A": list("abcde")})
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series(range(5), dtype='int64')
+
+        assert_series_equal(expected, g.ngroup())
+        assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_one_group(self):
+        df = DataFrame({"A": [0] * 5})
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0] * 5)
+
+        assert_series_equal(expected, g.ngroup())
+        assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_empty(self):
+        ge = DataFrame().groupby(level=0)
+        se = Series().groupby(level=0)
+
+        # edge case, as this is usually considered float
+        e = Series(dtype='int64')
+
+        assert_series_equal(e, ge.ngroup())
+        assert_series_equal(e, se.ngroup())
+
+    def test_ngroup_dupe_index(self):
+        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+        assert_series_equal(expected, g.ngroup())
+        assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_mi(self):
+        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+        df = DataFrame({"A": list("aaaba")}, index=mi)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=mi)
+
+        assert_series_equal(expected, g.ngroup())
+        assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_groupby_not_col(self):
+        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
+        g = df.groupby([0, 0, 0, 1, 0])
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+        assert_series_equal(expected, g.ngroup())
+        assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_descending(self):
+        df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A'])
+        g = df.groupby(['A'])
+
+        ascending = Series([0, 0, 1, 0, 1])
+        descending = Series([1, 1, 0, 1, 0])
+
+        assert_series_equal(descending, (g.ngroups - 1) - ascending)
+        assert_series_equal(ascending, g.ngroup(ascending=True))
+        assert_series_equal(descending, g.ngroup(ascending=False))
+
+    def test_ngroup_matches_cumcount(self):
+        # specific case
+        df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'],
+                        ['a', 'x'], ['b', 'y']], columns=['A', 'X'])
+        g = df.groupby(['A', 'X'])
+
+        g_ngroup = g.ngroup()
+        g_cumcount = g.cumcount()
+        expected_ngroup = pd.Series([0, 1, 2, 0, 3])
+        expected_cumcount = pd.Series([0, 0, 0, 1, 0])
+
+        assert_series_equal(g_ngroup, expected_ngroup)
+        assert_series_equal(g_cumcount, expected_cumcount)
+
+    def test_ngroup_cumcount_pair(self):
+        from itertools import product
+
+        # brute force comparison, inefficient but clear
+        for p in product(range(3), repeat=4):
+            df = DataFrame({'a': p})
+            g = df.groupby(['a'])
+
+            order = sorted(set(p))
+            ngroupd = [order.index(val) for val in p]
+            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
+
+            assert_series_equal(g.ngroup(), pd.Series(ngroupd))
+            assert_series_equal(g.cumcount(), pd.Series(cumcounted))
+
+    def test_ngroup_respects_groupby_order(self):
+        np.random.seed(0)
+        df = DataFrame({'a': np.random.choice(list('abcdef'), 100)})
+        for sort_flag in (False, True):
+            g = df.groupby(['a'], sort=sort_flag)
+            df['group_id'] = -1
+            df['group_index'] = -1
+
+            for i, (key, group) in enumerate(g):
+                df.loc[group.index, 'group_id'] = i
+                for j, ind in enumerate(group.index):
+                    df.loc[ind, 'group_index'] = j
+
+            assert_series_equal(pd.Series(df['group_id'].values),
+                                g.ngroup())
+            assert_series_equal(pd.Series(df['group_index'].values),
+                                g.cumcount())
+
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
     tups = com._asarray_tuplesafe(tups)