Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Categorical comparison with unordered #16339

Merged
merged 1 commit into from
May 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,14 @@ the original values:
np.asarray(cat) > base
When you compare two unordered categoricals with the same categories, the order is not considered:

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

versionadded tag

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think not necessary since it's a bugfix.

.. ipython:: python
c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False)
c1 == c2
Operations
----------

Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@ Numeric
^^^^^^^


Categorical
^^^^^^^^^^^

- Fixed comparison operations considering the order of the categories when both categoricals are unordered (:issue:`16014`)

Other
^^^^^
28 changes: 21 additions & 7 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,31 @@ def f(self, other):
"equality or not")
if isinstance(other, Categorical):
# Two Categoricals can only be compared if the categories are
# the same
if ((len(self.categories) != len(other.categories)) or
not ((self.categories == other.categories).all())):
raise TypeError("Categoricals can only be compared if "
"'categories' are the same")
# the same (maybe up to ordering, depending on ordered)

msg = ("Categoricals can only be compared if "
"'categories' are the same.")
if len(self.categories) != len(other.categories):
raise TypeError(msg + " Categories are different lengths")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you are including the original message here, but it is a little awkward

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What did you have in mind? I like how it's currently "Categoricals can only be compared if 'categories' are the same. Categories are different lengths." Since it's the general problem (different categories) and a specific hint
as to what's wrong (different lengths)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually this is fine.

elif (self.ordered and not (self.categories ==
other.categories).all()):
raise TypeError(msg)
elif not set(self.categories) == set(other.categories):
raise TypeError(msg)

if not (self.ordered == other.ordered):
raise TypeError("Categoricals can only be compared if "
"'ordered' is the same")
na_mask = (self._codes == -1) | (other._codes == -1)
if not self.ordered and not self.categories.equals(
other.categories):
# both unordered and different order
other_codes = _get_codes_for_values(other, self.categories)
else:
other_codes = other._codes

na_mask = (self._codes == -1) | (other_codes == -1)
f = getattr(self._codes, op)
ret = f(other._codes)
ret = f(other_codes)
if na_mask.any():
# In other series, this leads to False, so do that here too
ret[na_mask] = False
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3821,6 +3821,43 @@ def test_cat_equality(self):
pytest.raises(TypeError, lambda: a > b)
pytest.raises(TypeError, lambda: b > a)

@pytest.mark.parametrize('ctor', [
    lambda *args, **kwargs: Categorical(*args, **kwargs),
    lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
])
def test_unordered_different_order_equal(self, ctor):
    """Compare unordered categoricals whose categories are permuted.

    ``ctor`` builds the objects under test: either a bare Categorical or
    a Series wrapping one.  In every pair below the left operand uses
    categories ``['a', 'b']`` and the right operand ``['b', 'a']``.

    See https://github.com/pandas-dev/pandas/issues/16014
    """
    def make(values, categories):
        # Every object built in this test is unordered.
        return ctor(values, categories=categories, ordered=False)

    # Same values on both sides: every position compares equal.
    left = make(['a', 'b'], ['a', 'b'])
    right = make(['a', 'b'], ['b', 'a'])
    assert (left == right).all()

    # Values swapped between the sides: every position differs.
    left = make(['a', 'b'], ['a', 'b'])
    right = make(['b', 'a'], ['b', 'a'])
    assert (left != right).all()

    # Constant but different values on each side: every position differs.
    left = make(['a', 'a'], ['a', 'b'])
    right = make(['b', 'b'], ['b', 'a'])
    assert (left != right).all()

    # Only the first position matches.
    left = make(['a', 'a'], ['a', 'b'])
    right = make(['a', 'b'], ['b', 'a'])
    expected = np.array([True, False])
    tm.assert_numpy_array_equal(np.array(left == right), expected)

def test_unordered_different_categories_raises(self):
    """Comparing unordered categoricals with different categories raises.

    Both operands have two categories, but the sets differ
    (``{'a', 'b'}`` versus ``{'c', 'a'}``), so ``==`` must raise TypeError.
    """
    expected_msg = "Categoricals can only be compared"
    left = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
    right = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)

    with tm.assert_raises_regex(TypeError, expected_msg):
        left == right

def test_compare_different_lengths(self):
    """Comparing categoricals with a different number of categories raises.

    The operands hold no values; only the category lists differ in length
    (two versus one), and the error message must say so.
    """
    two_categories = Categorical([], categories=['a', 'b'])
    one_category = Categorical([], categories=['a'])

    with tm.assert_raises_regex(TypeError,
                                "Categories are different lengths"):
        two_categories == one_category

def test_concat_append(self):
cat = pd.Categorical(["a", "b"], categories=["a", "b"])
vals = [1, 2]
Expand Down