diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cbb4d32cc5edb1..76666a06500b65 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1113,3 +1113,4 @@ Other - Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`) - Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`) - Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`) +- Bug in ``DataFrame.rank()`` and ``Series.rank()`` when ``method='dense'`` and ``pct=True`` (:issue:`15630`) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index aafffbf60f638f..73b8c494ee14f3 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -198,7 +198,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, sum_ranks = dups = 0 {{endif}} if pct: - return ranks / count + if tiebreak == TIEBREAK_DENSE: + return ranks / total_tie_count + else: + return ranks / count else: return ranks @@ -370,7 +373,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if pct: - ranks[i, :] /= count + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count if axis == 0: return ranks.T else: diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index b115218d769582..7d6cd351344879 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -218,6 +218,14 @@ def test_rank_methods_frame(self): expected = expected.astype('float64') tm.assert_frame_equal(result, expected) + def test_rank_dense_(self): + df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]]) + result = df.rank(method='dense', pct=True) + expected = DataFrame([[1., 1., 1.], + [1., 0.5, 2. / 3], + [1., 0.5, 1. / 3]]) + assert_frame_equal(result, expected) + def test_rank_descending(self): dtypes = ['O', 'f8', 'i8'] diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index f47eae3adc3aef..ad1c1f84a48f8c 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -287,6 +287,25 @@ def test_rank_dense_method(self): expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) + def test_rank_dense_(self): + # GH15630, pct should be on 100% basis even when method='dense' + in_out = [([1], [1.]), + ([2], [1.]), + ([0], [1.]), + ([2, 2], [1., 1.]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]), + ([-5, -4, -3, -2, -1], + [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])] + + for ser, exp in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) + result = s.rank(method='dense', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + def test_rank_descending(self): dtypes = ['O', 'f8', 'i8'] diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py new file mode 100644 index 00000000000000..fb67f15bee0d23 --- /dev/null +++ b/pandas/tests/test_stats.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- +from pandas import compat + +from distutils.version import LooseVersion +from numpy import nan +import numpy as np + +from pandas import Series, DataFrame + +from pandas.compat import product +from pandas.util.testing import (assert_frame_equal, assert_series_equal) +import pandas.util.testing as tm + + +class TestRank(tm.TestCase): + s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + df = DataFrame({'A': s, 'B': s}) + + results = { + 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, + 3.5, 1.5, 8.0, nan, 5.5]), + 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), + } + + def test_rank_tie_methods(self): + s = self.s + + def _check(s, expected, method='average'): + result = s.rank(method=method) + tm.assert_series_equal(result, Series(expected)) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, dtype in product(results, dtypes): + if (dtype, method) in disabled: + continue + series = s if dtype is None else s.astype(dtype) + _check(series, results[method], method=method) + + def test_rank_methods_series(self): + tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + import scipy + from scipy.stats import rankdata + + xs = np.random.randn(9) + xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates + np.random.shuffle(xs) + + index = [chr(ord('a') + i) for i in range(len(xs))] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + ts = Series(vals, index=index) + + for m in ['average', 'min', 'max', 'first', 'dense']: + result = ts.rank(method=m) + sprank = rankdata(vals, m if m != 'first' else 'ordinal') + expected = Series(sprank, index=index) + + if LooseVersion(scipy.__version__) >= '0.17.0': + expected = expected.astype('float64') + tm.assert_series_equal(result, expected) + + def test_rank_methods_frame(self): + tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata') + import scipy + from scipy.stats import rankdata + + xs = np.random.randint(0, 21, (100, 26)) + xs = (xs - 10.0) / 10.0 + cols = [chr(ord('z') - i) for i in range(xs.shape[1])] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + df = DataFrame(vals, columns=cols) + + for ax in [0, 1]: + for m in ['average', 'min', 'max', 'first', 'dense']: + result = df.rank(axis=ax, method=m) + sprank = np.apply_along_axis( + rankdata, ax, vals, + m if m != 'first' else 'ordinal') + sprank = sprank.astype(np.float64) + expected = DataFrame(sprank, columns=cols) + + if LooseVersion(scipy.__version__) >= '0.17.0': + expected = expected.astype('float64') + tm.assert_frame_equal(result, expected) + + def test_rank_dense_method(self): + dtypes = ['O', 'f8', 'i8'] + in_out = [([1], [1]), + ([2], [1]), + ([0], [1]), + ([2, 2], [1, 1]), + ([1, 2, 3], [1, 2, 3]), + ([4, 2, 1], [3, 2, 1],), + ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), + ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])] + + for ser, exp in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) + result = s.rank(method='dense') + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + # GH15630, pct should be on 100% basis even when method='dense' + in_out = [([1], [1.]), + ([2], [1.]), + ([0], [1.]), + ([2, 2], [1., 1.1]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]), + ([-5, -4, -3, -2, -1], + [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])] + + for ser, exp in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) + result = s.rank(method='dense', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]]) + result = df.rank(method='dense', pct=True) + expected = DataFrame([[1., 1., 1.], + [1., 0.5, 2. / 3], + [1., 0.5, 1. / 3]]) + assert_frame_equal(result, expected) + + def test_rank_descending(self): + dtypes = ['O', 'f8', 'i8'] + + for dtype, method in product(dtypes, self.results): + if 'i' in dtype: + s = self.s.dropna() + df = self.df.dropna() + else: + s = self.s.astype(dtype) + df = self.df.astype(dtype) + + res = s.rank(ascending=False) + expected = (s.max() - s).rank() + assert_series_equal(res, expected) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + assert_frame_equal(res, expected) + + if method == 'first' and dtype == 'O': + continue + + expected = (s.max() - s).rank(method=method) + res2 = s.rank(method=method, ascending=False) + assert_series_equal(res2, expected) + + expected = (df.max() - df).rank(method=method) + + if dtype != 'O': + res2 = df.rank(method=method, ascending=False, + numeric_only=True) + assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, + numeric_only=False) + assert_frame_equal(res3, expected) + + def test_rank_2d_tie_methods(self): + df = self.df + + def _check2d(df, expected, method='average', axis=0): + exp_df = DataFrame({'A': expected, 'B': expected}) + + if axis == 1: + df = df.T + exp_df = exp_df.T + + result = df.rank(method=method, axis=axis) + assert_frame_equal(result, exp_df) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, axis, dtype in product(results, [0, 1], dtypes): + if (dtype, method) in disabled: + continue + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, results[method], method=method, axis=axis) + + def test_rank_int(self): + s = self.s.dropna().astype('i8') + + for method, res in compat.iteritems(self.results): + result = s.rank(method=method) + expected = Series(res).dropna() + expected.index = result.index + assert_series_equal(result, expected) + + def test_rank_object_bug(self): + # GH 13445 + + # smoke tests + Series([np.nan] * 32).astype(object).rank(ascending=True) + Series([np.nan] * 32).astype(object).rank(ascending=False)