pandas-dev · jreback · May 30, 2019 · May 13, 2019 · May 15, 2019 · May 15, 2019
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -601,6 +601,49 @@ must be either implemented on GroupBy or available via :ref:`dispatching
    grouped.agg({'D': 'std', 'C': 'mean'})
    grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')]))
 
+.. _groupby.aggregate.keyword:
+
+.. versionadded:: 0.25.0
+
+To support column-specific aggregation with control over the output column names, pandas
+accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation", where
+
+- The keywords are the *output* column names
+- The values are tuples whose first element is the column to select
+  and the second element is the function to apply to that column.
+
+.. ipython:: python
+
+   animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
+                           'height': [9.1, 6.0, 9.5, 34.0],
+                           'weight': [7.9, 7.5, 9.9, 198.0]})
+   animals
+
+   animals.groupby("kind").agg(
+       min_height=('height', 'min'),
+       max_height=('height', 'max'),
+       average_weight=('weight', np.mean),
+   )
+
+If your desired output column names are not valid Python identifiers, construct a dictionary
+and unpack the keyword arguments
+
+.. ipython:: python
+
+   animals.groupby("kind").agg(**{
+       'total weight': ('weight', sum),
+   })
+
+Additional keyword arguments are not passed through to the aggregation functions. Only pairs
+of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions
+require additional arguments, partially apply them with :func:`functools.partial`.
+
+.. note::
+
+   For Python 3.5 and earlier, the order of ``**kwargs`` in a function was not
+   preserved. Because the indeterminate keyword ordering would result in indeterminate
+   output column ordering, the output columns will always be sorted for Python 3.5.
+
 .. _groupby.aggregate.cython:
 
 Cython-optimized aggregation functions

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -19,6 +19,29 @@ These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog
 including other versions of pandas.
 
 
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_0250.enhancements.agg_relabel:
+
+Groupby Aggregation with Relabeling
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Pandas has added special groupby behavior, known as "keyword aggregation", for naming the
+output columns when applying multiple aggregation functions to specific columns (:issue:`18366`).
+
+.. ipython:: python
+
+   df = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
+                      'height': [9.1, 6.0, 9.5, 34.0],
+                      'weight': [7.9, 7.5, 9.9, 198.0]})
+   grouper = df.groupby("kind")
+   grouper.agg(max_height=('height', 'max'), average_weight=('weight', 'mean'))
+
+Pass the desired column names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs``
+should be tuples where the first element is the column selection, and the second element is the
+aggregation function to apply. See :ref:`keyword aggregation <groupby.aggregate.keyword>` for more.
+
 .. _whatsnew_0250.enhancements.other:
 
 Other Enhancements

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -15,6 +15,7 @@
 import numpy as np
 
 from pandas._libs import Timestamp, lib
+from pandas.compat import PY36
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import Appender, Substitution
 
@@ -144,8 +145,33 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
         return new_items, new_blocks
 
     def aggregate(self, func, *args, **kwargs):
-
         _level = kwargs.pop('_level', None)
+
+        relabeling = func is None and _is_multi_agg_with_relabel(**kwargs)
+        if relabeling:
+            if not PY36:
+                kwargs = OrderedDict(sorted(kwargs.items()))
+
+            # Normalize the aggregation functions as Dict[column, List[func]],
+            # process normally, then fixup the names.
+            # TODO(Py35): When we drop python 3.5, change this to
+            # defaultdict(list)
+            func = OrderedDict()
+            order = []
+            columns, pairs = list(zip(*kwargs.items()))
+
+            for i, (name, (column, aggfunc)) in enumerate(zip(columns, pairs)):
+                if column in func:
+                    func[column].append(aggfunc)
+                else:
+                    func[column] = [aggfunc]
+                order.append((column, _get_agg_name(aggfunc)))
+            kwargs = {}
+        elif func is None:
+            # nicer error message
+            raise TypeError("Must provide 'func' or tuples of "
+                            "'(column, aggfunc).")
+
         result, how = self._aggregate(func, _level=_level, *args, **kwargs)
         if how is None:
             return result
@@ -179,6 +205,10 @@ def aggregate(self, func, *args, **kwargs):
             self._insert_inaxis_grouper_inplace(result)
             result.index = np.arange(len(result))
 
+        if relabeling:
+            result = result[order]
+            result.columns = columns
+
         return result._convert(datetime=True)
 
     agg = aggregate
@@ -791,11 +821,8 @@ def _aggregate_multiple_funcs(self, arg, _level):
             # list of functions / function names
             columns = []
             for f in arg:
-                if isinstance(f, str):
-                    columns.append(f)
-                else:
-                    # protect against callables without names
-                    columns.append(com.get_callable_name(f))
+                columns.append(_get_agg_name(f))
+
             arg = zip(columns, arg)
 
         results = OrderedDict()
@@ -1296,6 +1323,16 @@ class DataFrameGroupBy(NDFrameGroupBy):
     A
     1   1   2  0.590716
     2   3   4  0.704907
+
+    To control the output names with different aggregations
+    per column, pass tuples of ``(column, aggfunc)`` as kwargs
+
+    >>> df.groupby("A").agg(
+    ...     b_min=("B", "min"), c_sum=("C", "sum"))
+           b_min     c_sum
+    A
+    1      1  0.825627
+    2      3  2.218618
     """)
 
     @Substitution(see_also=_agg_see_also_doc,
@@ -1304,7 +1341,7 @@ class DataFrameGroupBy(NDFrameGroupBy):
                   klass='DataFrame',
                   axis='')
     @Appender(_shared_docs['aggregate'])
-    def aggregate(self, arg, *args, **kwargs):
+    def aggregate(self, arg=None, *args, **kwargs):
         return super().aggregate(arg, *args, **kwargs)
 
     agg = aggregate
@@ -1577,3 +1614,48 @@ def groupby_series(obj, col=None):
         return results
 
     boxplot = boxplot_frame_groupby
+
+
+def _is_multi_agg_with_relabel(**kwargs):
+    """
+    Check whether kwargs passed to .agg look like multi-agg with relabeling.
+
+    Parameters
+    ----------
+    **kwargs : dict
+
+    Returns
+    -------
+    bool
+        True only if kwargs is non-empty and every value is a 2-tuple.
+
+    Examples
+    --------
+    >>> _is_multi_agg_with_relabel(a='max')
+    False
+    >>> _is_multi_agg_with_relabel(a_max=('a', 'max'),
+    ...                            a_min=('a', 'min'))
+    True
+    >>> _is_multi_agg_with_relabel()
+    False
+    """
+    return bool(kwargs) and all(isinstance(v, tuple) and len(v) == 2
+                                for v in kwargs.values())
+
+
+def _get_agg_name(arg):
+    """
+    Return the name for the aggregation `arg`.
+
+    A str is returned unchanged; anything else is passed to
+    ``com.get_callable_name`` and that result is returned.
+
+    Parameters
+    ----------
+    arg : str or callable
+    """
+    if isinstance(arg, str):
+        return arg
+    else:
+        # protect against callables without names
+        return com.get_callable_name(arg)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -7,7 +7,7 @@
 import pytest
 
 import pandas as pd
-from pandas import DataFrame, Index, MultiIndex, Series, concat
+from pandas import DataFrame, Index, MultiIndex, Series, concat, compat
 from pandas.core.base import SpecificationError
 from pandas.core.groupby.grouper import Grouping
 import pandas.util.testing as tm
@@ -313,3 +313,79 @@ def test_order_aggregate_multiple_funcs():
     expected = pd.Index(['sum', 'max', 'mean', 'ohlc', 'min'])
 
     tm.assert_index_equal(result, expected)
+
+
+class TestKeywordAggregation:
+    """Tests for ``.agg(name=(column, aggfunc))`` keyword aggregation."""
+    def test_agg_relabel(self):
+        df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'],
+                           "A": [0, 1, 2, 3],
+                           "B": [5, 6, 7, 8]})
+        result = df.groupby("group").agg(
+            a_max=("A", "max"),
+            b_max=("B", "max"),
+        )
+        expected = pd.DataFrame({"a_max": [1, 3], "b_max": [6, 8]},
+                                index=pd.Index(['a', 'b'], name='group'),
+                                columns=['a_max', 'b_max'])
+        tm.assert_frame_equal(result, expected)
+
+        # output columns follow the keyword order (sorted when not PY36)
+        result = df.groupby('group').agg(
+            b_min=("B", "min"),
+            a_min=("A", min),
+            a_max=("A", "max"),
+            b_max=("B", "max"),
+        )
+        expected = pd.DataFrame({"b_min": [5, 7],
+                                 "a_min": [0, 2],
+                                 "a_max": [1, 3],
+                                 "b_max": [6, 8]},
+                                index=pd.Index(['a', 'b'], name='group'),
+                                columns=['b_min', 'a_min', 'a_max', 'b_max'])
+        if not compat.PY36:
+            expected = expected[['a_max', 'a_min', 'b_max', 'b_min']]
+        tm.assert_frame_equal(result, expected)
+
+    def test_agg_relabel_non_identifier(self):
+        df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'],
+                           "A": [0, 1, 2, 3],
+                           "B": [5, 6, 7, 8]})
+        # 'my col' is not a valid identifier, so pass it via ** unpacking
+        result = df.groupby("group").agg(**{'my col': ('A', 'max')})
+        expected = pd.DataFrame({'my col': [1, 3]},
+                                index=pd.Index(['a', 'b'], name='group'))
+        tm.assert_frame_equal(result, expected)
+
+    def test_duplicate_raises(self):
+        # TODO: we currently raise on multiple lambdas. We could *maybe*
+        # update com.get_callable_name to append `_i` to each lambda.
+        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
+        with pytest.raises(SpecificationError, match="Function names"):
+            df.groupby("A").agg(a=("A", "min"), b=("A", "min"))
+
+    def test_agg_relabel_with_level(self):
+        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
+                          index=pd.MultiIndex.from_product([['A', 'B'],
+                                                            ['a', 'b']]))
+        result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'),
+                                         cc=('B', 'mean'))
+        expected = pd.DataFrame({
+            'aa': [0, 1],
+            'bb': [0, 1],
+            'cc': [1.5, 3.5]
+        }, index=['A', 'B'])
+        tm.assert_frame_equal(result, expected)
+
+    def test_agg_relabel_other_raises(self):
+        df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
+        grouped = df.groupby("A")
+        match = 'Must provide'
+        with pytest.raises(TypeError, match=match):
+            grouped.agg(foo=1)
+        # neither func nor any keyword arguments
+        with pytest.raises(TypeError, match=match):
+            grouped.agg()
+        # one value is a 3-tuple rather than a (column, aggfunc) pair
+        with pytest.raises(TypeError, match=match):
+            grouped.agg(a=('B', 'max'), b=(1, 2, 3))