Skip to content

Commit

Permalink
BUG: Categorical comparison with unordered (#16339)
Browse files Browse the repository at this point in the history
Fixes categorical comparison operations improperly considering
ordering when two unordered categoricals are compared.

Closes #16014
(cherry picked from commit 91e9e52)
  • Loading branch information
TomAugspurger committed May 30, 2017
1 parent bf7ceec commit 4a93245
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 7 deletions.
8 changes: 8 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,14 @@ the original values:
np.asarray(cat) > base
When you compare two unordered categoricals with the same categories, the order is not considered:

.. ipython:: python
c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False)
c1 == c2
Operations
----------

Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ Numeric
^^^^^^^


Categorical
^^^^^^^^^^^

- Fixed comparison operations considering the order of the categories when both categoricals are unordered (:issue:`16014`)

Other
^^^^^
28 changes: 21 additions & 7 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,31 @@ def f(self, other):
"equality or not")
if isinstance(other, Categorical):
# Two Categoricals can only be be compared if the categories are
# the same
if ((len(self.categories) != len(other.categories)) or
not ((self.categories == other.categories).all())):
raise TypeError("Categoricals can only be compared if "
"'categories' are the same")
# the same (maybe up to ordering, depending on ordered)

msg = ("Categoricals can only be compared if "
"'categories' are the same.")
if len(self.categories) != len(other.categories):
raise TypeError(msg + " Categories are different lengths")
elif (self.ordered and not (self.categories ==
other.categories).all()):
raise TypeError(msg)
elif not set(self.categories) == set(other.categories):
raise TypeError(msg)

if not (self.ordered == other.ordered):
raise TypeError("Categoricals can only be compared if "
"'ordered' is the same")
na_mask = (self._codes == -1) | (other._codes == -1)
if not self.ordered and not self.categories.equals(
other.categories):
# both unordered and different order
other_codes = _get_codes_for_values(other, self.categories)
else:
other_codes = other._codes

na_mask = (self._codes == -1) | (other_codes == -1)
f = getattr(self._codes, op)
ret = f(other._codes)
ret = f(other_codes)
if na_mask.any():
# In other series, the leads to False, so do that here too
ret[na_mask] = False
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3822,6 +3822,43 @@ def test_cat_equality(self):
pytest.raises(TypeError, lambda: a > b)
pytest.raises(TypeError, lambda: b > a)

@pytest.mark.parametrize('ctor', [
lambda *args, **kwargs: Categorical(*args, **kwargs),
lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
])
def test_unordered_different_order_equal(self, ctor):
# https://github.com/pandas-dev/pandas/issues/16014
c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
assert (c1 == c2).all()

c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False)
assert (c1 != c2).all()

c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False)
assert (c1 != c2).all()

c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
result = c1 == c2
tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))

def test_unordered_different_categories_raises(self):
c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)
with tm.assert_raises_regex(TypeError,
"Categoricals can only be compared"):
c1 == c2

def test_compare_different_lengths(self):
c1 = Categorical([], categories=['a', 'b'])
c2 = Categorical([], categories=['a'])
msg = "Categories are different lengths"
with tm.assert_raises_regex(TypeError, msg):
c1 == c2

def test_concat_append(self):
cat = pd.Categorical(["a", "b"], categories=["a", "b"])
vals = [1, 2]
Expand Down

0 comments on commit 4a93245

Please sign in to comment.