From 9921b68a41567018607bcbdee41096ead67ae91a Mon Sep 17 00:00:00 2001 From: peterpanmj Date: Fri, 25 May 2018 15:30:23 +0800 Subject: [PATCH] BUG: make dense ranks results scale to 100 percent (#20731) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/_libs/groupby_helper.pxi.in | 15 +++++++++++---- pandas/tests/groupby/test_rank.py | 14 +++++++------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index b94377af770f4d..5c03abbbca0fd7 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -111,7 +111,7 @@ Offsets Numeric ^^^^^^^ -- +- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` - - diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 6a33e4a09476d4..82a56c50a196cb 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -418,7 +418,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, bint is_datetimelike, object ties_method, bint ascending, bint pct, object na_option): """ - Provides the rank of values within each group. + Provides the rank of values within each group. Parameters ---------- @@ -452,7 +452,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, cdef: TiebreakEnumType tiebreak Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 ndarray[int64_t] _as ndarray[float64_t, ndim=2] grp_sizes ndarray[{{c_type}}] masked_vals @@ -563,6 +563,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups = sum_ranks = 0 val_start = i grp_vals_seen += 1 + grp_tie_count +=1 # Similar to the previous conditional, check now if we are moving # to a new group. If so, keep track of the index where the new @@ -571,10 +572,16 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # (used by pct calculations later). also be sure to reset any of # the items helping to calculate dups if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count + - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 + grp_tie_count = 0 val_start = i + 1 grp_start = i + 1 grp_vals_seen = 1 diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 6ad8b4905abff6..203c3c73bec944 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -59,9 +59,9 @@ def test_rank_apply(): ('first', False, False, [3., 4., 1., 5., 2.]), ('first', False, True, [.6, .8, .2, 1., .4]), ('dense', True, False, [1., 1., 3., 1., 2.]), - ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]), ('dense', False, False, [3., 3., 1., 3., 2.]), - ('dense', False, True, [.6, .6, .2, .6, .4]), + ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]), ]) def test_rank_args(grps, vals, ties_method, ascending, pct, exp): key = np.repeat(grps, len(vals)) @@ -126,7 +126,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): @pytest.mark.parametrize("grps", [ ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-06'), np.nan, np.nan] @@ -167,11 +167,11 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): ('dense', True, 'keep', False, [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), ('dense', True, 'keep', True, - [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]), ('dense', False, 'keep', False, [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), ('dense', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]), ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), ('average', True, 'no_na', True, [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), @@ -198,10 +198,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), ('dense', True, 'no_na', True, - [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]), ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), ('dense', False, 'no_na', True, - [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) + [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.]) ]) def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):