diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index a508e84465107..ef558381c5e6f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -453,6 +453,14 @@ the original values: np.asarray(cat) > base +When you compare two unordered categoricals with the same categories, the order is not considered: + +.. ipython:: python + + c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 == c2 + Operations ---------- diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 7773f5abfb0ba..be4cf85606935 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -85,7 +85,10 @@ Numeric ^^^^^^^ +Categorical +^^^^^^^^^^^ +- Fixed comparison operations considering the order of the categories when both categoricals are unordered (:issue:`16014`) Other ^^^^^ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index edbb07b7069e9..5b663f1d85ee7 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -55,17 +55,31 @@ def f(self, other): "equality or not") if isinstance(other, Categorical): # Two Categoricals can only be be compared if the categories are - # the same - if ((len(self.categories) != len(other.categories)) or - not ((self.categories == other.categories).all())): - raise TypeError("Categoricals can only be compared if " - "'categories' are the same") + # the same (maybe up to ordering, depending on ordered) + + msg = ("Categoricals can only be compared if " + "'categories' are the same.") + if len(self.categories) != len(other.categories): + raise TypeError(msg + " Categories are different lengths") + elif (self.ordered and not (self.categories == + other.categories).all()): + raise TypeError(msg) + elif not set(self.categories) == set(other.categories): + raise TypeError(msg) + if not (self.ordered == other.ordered): raise TypeError("Categoricals can only be compared if " "'ordered' is the same") - na_mask = (self._codes == -1) | (other._codes == -1) + if not self.ordered and not self.categories.equals( + other.categories): + # both unordered and different order + other_codes = _get_codes_for_values(other, self.categories) + else: + other_codes = other._codes + + na_mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, op) - ret = f(other._codes) + ret = f(other_codes) if na_mask.any(): # In other series, the leads to False, so do that here too ret[na_mask] = False diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 57676be68bedf..f48eea23220b8 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3821,6 +3821,43 @@ def test_cat_equality(self): pytest.raises(TypeError, lambda: a > b) pytest.raises(TypeError, lambda: b > a) + @pytest.mark.parametrize('ctor', [ + lambda *args, **kwargs: Categorical(*args, **kwargs), + lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), + ]) + def test_unordered_different_order_equal(self, ctor): + # https://github.com/pandas-dev/pandas/issues/16014 + c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + assert (c1 == c2).all() + + c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) + c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) + c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + result = c1 == c2 + tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) + + def test_unordered_different_categories_raises(self): + c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False) + with tm.assert_raises_regex(TypeError, + "Categoricals can only be compared"): + c1 == c2 + + def test_compare_different_lengths(self): + c1 = Categorical([], categories=['a', 'b']) + c2 = Categorical([], categories=['a']) + msg = "Categories are different lengths" + with tm.assert_raises_regex(TypeError, msg): + c1 == c2 + def test_concat_append(self): cat = pd.Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2]