Skip to content

Commit

Permalink
BUG: Dense ranking with percent now uses 100% basis
Browse files Browse the repository at this point in the history
- `DataFrame.rank()` and `Series.rank()` with `method='dense'` and
  `pct=True` now scale the result to a 100% basis.

See pandas-dev#15630
  • Loading branch information
rouzazari authored and gfyoung committed Mar 2, 2018
1 parent 5f271eb commit 0421dc5
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1731,3 +1731,4 @@ Other
- Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`)
- Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`)
- Use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`)
- Bug in ``DataFrame.rank()`` and ``Series.rank()`` when ``method='dense'`` and ``pct=True`` where the percentile ranks were not scaled to a 100% basis (:issue:`15630`)
10 changes: 8 additions & 2 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
sum_ranks = dups = 0
{{endif}}
if pct:
return ranks / count
if tiebreak == TIEBREAK_DENSE:
return ranks / total_tie_count
else:
return ranks / count
else:
return ranks

Expand Down Expand Up @@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0
if pct:
ranks[i, :] /= count
if tiebreak == TIEBREAK_DENSE:
ranks[i, :] /= total_tie_count
else:
ranks[i, :] /= count
if axis == 0:
return ranks.T
else:
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/frame/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,14 @@ def test_rank_methods_frame(self):
expected = expected.astype('float64')
tm.assert_frame_equal(result, expected)

def test_rank_dense_(self):
df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
result = df.rank(method='dense', pct=True)
expected = DataFrame([[1., 1., 1.],
[1., 0.5, 2. / 3],
[1., 0.5, 1. / 3]])
assert_frame_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']

Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,25 @@ def test_rank_dense_method(self):
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)

def test_rank_dense_(self):
# GH15630, pct should be on 100% basis even when method='dense'
in_out = [([1], [1.]),
([2], [1.]),
([0], [1.]),
([2, 2], [1., 1.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
([-5, -4, -3, -2, -1],
[1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]

for ser, exp in in_out:
for dtype in dtypes:
s = Series(ser).astype(dtype)
result = s.rank(method='dense', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']

Expand Down
210 changes: 210 additions & 0 deletions pandas/tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-
from pandas import compat

from distutils.version import LooseVersion
from numpy import nan
import numpy as np

from pandas import Series, DataFrame

from pandas.compat import product
from pandas.util.testing import (assert_frame_equal, assert_series_equal)
import pandas.util.testing as tm


# Rank tests for Series and DataFrame. The class attributes below are the
# fixtures shared by the test methods.
class TestRank(tm.TestCase):
# Input values: several ties plus two missing entries (positions 4 and 8).
s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
# Two-column frame in which both columns are ``s``.
df = DataFrame({'A': s, 'B': s})

# Expected ascending ranks of ``s``, keyed by tie-breaking method name.
# The positions that are NaN in ``s`` are NaN here as well.
results = {
'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
3.5, 1.5, 8.0, nan, 5.5]),
'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
}

def test_rank_tie_methods(self):
s = self.s

def _check(s, expected, method='average'):
result = s.rank(method=method)
tm.assert_series_equal(result, Series(expected))

dtypes = [None, object]
disabled = set([(object, 'first')])
results = self.results

for method, dtype in product(results, dtypes):
if (dtype, method) in disabled:
continue
series = s if dtype is None else s.astype(dtype)
_check(series, results[method], method=method)

def test_rank_methods_series(self):
    """Cross-check ``Series.rank`` against ``scipy.stats.rankdata``.

    Random data containing repeated values is ranked as drawn, shifted by
    1e6 and scaled by 1e-6, once per tie method. pandas' 'first' is
    compared with scipy's 'ordinal'.
    """
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    # Concatenating ever-shorter tails of one draw introduces duplicates.
    draw = np.random.randn(9)
    xs = np.concatenate([draw[start:] for start in range(0, 9, 2)])
    np.random.shuffle(xs)

    labels = [chr(ord('a') + pos) for pos in range(len(xs))]
    needs_cast = LooseVersion(scipy.__version__) >= '0.17.0'

    for vals in (xs, xs + 1e6, xs * 1e-6):
        ts = Series(vals, index=labels)

        for m in ('average', 'min', 'max', 'first', 'dense'):
            scipy_method = 'ordinal' if m == 'first' else m
            result = ts.rank(method=m)
            expected = Series(rankdata(vals, scipy_method), index=labels)
            if needs_cast:
                expected = expected.astype('float64')
            tm.assert_series_equal(result, expected)

def test_rank_methods_frame(self):
    """Cross-check ``DataFrame.rank`` against ``scipy.stats.rankdata``.

    A 100 x 26 grid of values in [-1.0, 1.0] (steps of 0.1) is ranked along
    both axes, as generated, shifted by 1e6 and scaled by 1e-6, for every
    tie method. pandas' 'first' is compared with scipy's 'ordinal'.
    """
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    raw = np.random.randint(0, 21, (100, 26))
    xs = (raw - 10.0) / 10.0
    # Column labels count down from 'z'.
    cols = [chr(ord('z') - offset) for offset in range(xs.shape[1])]
    needs_cast = LooseVersion(scipy.__version__) >= '0.17.0'
    methods = ['average', 'min', 'max', 'first', 'dense']

    for vals in (xs, xs + 1e6, xs * 1e-6):
        df = DataFrame(vals, columns=cols)

        for ax, m in product([0, 1], methods):
            scipy_method = 'ordinal' if m == 'first' else m
            result = df.rank(axis=ax, method=m)
            reference = np.apply_along_axis(rankdata, ax, vals,
                                            scipy_method)
            reference = reference.astype(np.float64)
            expected = DataFrame(reference, columns=cols)
            if needs_cast:
                expected = expected.astype('float64')
            tm.assert_frame_equal(result, expected)

def test_rank_dense_method(self):
dtypes = ['O', 'f8', 'i8']
in_out = [([1], [1]),
([2], [1]),
([0], [1]),
([2, 2], [1, 1]),
([1, 2, 3], [1, 2, 3]),
([4, 2, 1], [3, 2, 1],),
([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

for ser, exp in in_out:
for dtype in dtypes:
s = Series(ser).astype(dtype)
result = s.rank(method='dense')
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)

# GH15630, pct should be on 100% basis even when method='dense'
in_out = [([1], [1.]),
([2], [1.]),
([0], [1.]),
([2, 2], [1., 1.1]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
([-5, -4, -3, -2, -1],
[1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]

for ser, exp in in_out:
for dtype in dtypes:
s = Series(ser).astype(dtype)
result = s.rank(method='dense', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)

df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
result = df.rank(method='dense', pct=True)
expected = DataFrame([[1., 1., 1.],
[1., 0.5, 2. / 3],
[1., 0.5, 1. / 3]])
assert_frame_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']

for dtype, method in product(dtypes, self.results):
if 'i' in dtype:
s = self.s.dropna()
df = self.df.dropna()
else:
s = self.s.astype(dtype)
df = self.df.astype(dtype)

res = s.rank(ascending=False)
expected = (s.max() - s).rank()
assert_series_equal(res, expected)

res = df.rank(ascending=False)
expected = (df.max() - df).rank()
assert_frame_equal(res, expected)

if method == 'first' and dtype == 'O':
continue

expected = (s.max() - s).rank(method=method)
res2 = s.rank(method=method, ascending=False)
assert_series_equal(res2, expected)

expected = (df.max() - df).rank(method=method)

if dtype != 'O':
res2 = df.rank(method=method, ascending=False,
numeric_only=True)
assert_frame_equal(res2, expected)

res3 = df.rank(method=method, ascending=False,
numeric_only=False)
assert_frame_equal(res3, expected)

def test_rank_2d_tie_methods(self):
df = self.df

def _check2d(df, expected, method='average', axis=0):
exp_df = DataFrame({'A': expected, 'B': expected})

if axis == 1:
df = df.T
exp_df = exp_df.T

result = df.rank(method=method, axis=axis)
assert_frame_equal(result, exp_df)

dtypes = [None, object]
disabled = set([(object, 'first')])
results = self.results

for method, axis, dtype in product(results, [0, 1], dtypes):
if (dtype, method) in disabled:
continue
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, results[method], method=method, axis=axis)

def test_rank_int(self):
s = self.s.dropna().astype('i8')

for method, res in compat.iteritems(self.results):
result = s.rank(method=method)
expected = Series(res).dropna()
expected.index = result.index
assert_series_equal(result, expected)

def test_rank_object_bug(self):
# GH 13445

# smoke tests
Series([np.nan] * 32).astype(object).rank(ascending=True)
Series([np.nan] * 32).astype(object).rank(ascending=False)

0 comments on commit 0421dc5

Please sign in to comment.