From 52cffa3b3b2a510c30ed7f8cc8525c03d62e9130 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 9 Mar 2018 18:06:43 -0800 Subject: [PATCH] Cythonized GroupBy pct_change (#19919) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/groupby.py | 24 +++++++ pandas/tests/groupby/test_groupby.py | 55 ---------------- pandas/tests/groupby/test_transform.py | 87 ++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bea897e1b88e6..3afd9cff10e86 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -795,6 +795,7 @@ Performance Improvements - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) - Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) - Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`) .. 
_whatsnew_0230.docs: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6b10d2ca3b5b2..285c5786b532b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2044,6 +2044,23 @@ def shift(self, periods=1, freq=None, axis=0): result_is_index=True, periods=periods) + @Substitution(name='groupby') + @Appender(_doc_template) + def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, + axis=0): + """Calculate pct_change of each value to previous entry in group""" + if freq is not None or axis != 0: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq, + axis=axis)) + + filled = getattr(self, fill_method)(limit=limit).drop( + self.grouper.names, axis=1) + shifted = filled.shift(periods=periods, freq=freq) + + return (filled / shifted) - 1 + @Substitution(name='groupby') @Appender(_doc_template) def head(self, n=5): @@ -3884,6 +3901,13 @@ def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) + def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): + """Calculate percent change of each value to previous entry in group""" + filled = getattr(self, fill_method)(limit=limit) + shifted = filled.shift(periods=periods, freq=freq) + + return (filled / shifted) - 1 + class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0561b3a1d8592..be0c32cefa6ff 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2062,61 +2062,6 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ascending=ascending, na_option=na_option, pct=pct) - @pytest.mark.parametrize("mix_groupings", [True, False]) - @pytest.mark.parametrize("as_series", [True, False]) - @pytest.mark.parametrize("val1,val2", [ - ('foo', 'bar'), (1, 2), (1., 2.)]) - @pytest.mark.parametrize("fill_method,limit,exp_vals", [ - ("ffill", None, - [np.nan, 
np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), - ("ffill", 1, - [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), - ("bfill", None, - ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), - ("bfill", 1, - [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) - ]) - def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, - fill_method, limit, exp_vals): - vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] - _exp_vals = list(exp_vals) - # Overwrite placeholder values - for index, exp_val in enumerate(_exp_vals): - if exp_val == 'val1': - _exp_vals[index] = val1 - elif exp_val == 'val2': - _exp_vals[index] = val2 - - # Need to modify values and expectations depending on the - # Series / DataFrame that we ultimately want to generate - if mix_groupings: # ['a', 'b', 'a, 'b', ...] - keys = ['a', 'b'] * len(vals) - - def interweave(list_obj): - temp = list() - for x in list_obj: - temp.extend([x, x]) - - return temp - - _exp_vals = interweave(_exp_vals) - vals = interweave(vals) - else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] - keys = ['a'] * len(vals) + ['b'] * len(vals) - _exp_vals = _exp_vals * 2 - vals = vals * 2 - - df = DataFrame({'key': keys, 'val': vals}) - if as_series: - result = getattr( - df.groupby('key')['val'], fill_method)(limit=limit) - exp = Series(_exp_vals, name='val') - assert_series_equal(result, exp) - else: - result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'key': keys, 'val': _exp_vals}) - assert_frame_equal(result, exp) - @pytest.mark.parametrize("agg_func", ['any', 'all']) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("vals", [ diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index b418bb0c5fea6..bce38b8cf9eed 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -636,3 +636,90 @@ def test_transform_numeric_ret(self, cols, exp, comp_func, agg_func): exp = exp.astype('float') comp_func(result, exp) + + @pytest.mark.parametrize("mix_groupings", [True, False]) + @pytest.mark.parametrize("as_series", [True, False]) + @pytest.mark.parametrize("val1,val2", [ + ('foo', 'bar'), (1, 2), (1., 2.)]) + @pytest.mark.parametrize("fill_method,limit,exp_vals", [ + ("ffill", None, + [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), + ("ffill", 1, + [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), + ("bfill", None, + ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), + ("bfill", 1, + [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) + ]) + def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, + fill_method, limit, exp_vals): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + # Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == 'val1': + _exp_vals[index] = val1 + elif exp_val == 'val2': + _exp_vals[index] = val2 + + # Need to modify values and 
expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a', 'b', ...] + keys = ['a', 'b'] * len(vals) + + def interweave(list_obj): + temp = list() + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 'b', 'b', 'b'] + keys = ['a'] * len(vals) + ['b'] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({'key': keys, 'val': vals}) + if as_series: + result = getattr( + df.groupby('key')['val'], fill_method)(limit=limit) + exp = Series(_exp_vals, name='val') + assert_series_equal(result, exp) + else: + result = getattr(df.groupby('key'), fill_method)(limit=limit) + exp = DataFrame({'key': keys, 'val': _exp_vals}) + assert_frame_equal(result, exp) + + @pytest.mark.parametrize("test_series", [True, False]) + @pytest.mark.parametrize("periods,fill_method,limit", [ + (1, 'ffill', None), (1, 'ffill', 1), + (1, 'bfill', None), (1, 'bfill', 1), + (-1, 'ffill', None), (-1, 'ffill', 1), + (-1, 'bfill', None), (-1, 'bfill', 1)]) + def test_pct_change(self, test_series, periods, fill_method, limit): + vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + exp_vals = Series(vals).pct_change(periods=periods, + fill_method=fill_method, + limit=limit).tolist() + + df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals), + 'vals': vals * 2}) + grp = df.groupby('key') + + def get_result(grp_obj): + return grp_obj.pct_change(periods=periods, + fill_method=fill_method, + limit=limit) + + if test_series: + exp = pd.Series(exp_vals * 2) + exp.name = 'vals' + grp = grp['vals'] + result = get_result(grp) + tm.assert_series_equal(result, exp) + else: + exp = DataFrame({'vals': exp_vals * 2}) + result = get_result(grp) + tm.assert_frame_equal(result, exp)