Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: Categorical.is_dtype_equal -> categories_match_up_to_permutation #37545

Merged
merged 9 commits into from
Nov 2, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ Deprecations
- Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`)
- :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`)
- Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`)
- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, use :meth:`Categorical.categories_match_up_to_permutation` instead (:issue:`37545`)

.. ---------------------------------------------------------------------------

Expand Down
22 changes: 16 additions & 6 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def func(self, other):
# the same (maybe up to ordering, depending on ordered)

msg = "Categoricals can only be compared if 'categories' are the same."
if not self.is_dtype_equal(other):
if not self.categories_match_up_to_permutation(other):
raise TypeError(msg)

if not self.ordered and not self.categories.equals(other.categories):
Expand Down Expand Up @@ -1709,7 +1709,7 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
if self.categories.equals(target.categories):
# We use the same codes, so can go directly to the engine
codes = target.codes
elif self.is_dtype_equal(target):
elif self.categories_match_up_to_permutation(target):
# We have the same categories up to a reshuffling of codes.
codes = recode_for_categories(
target.codes, target.categories, self.categories
Expand Down Expand Up @@ -1882,11 +1882,12 @@ def _validate_setitem_value(self, value):

# require identical categories set
if isinstance(value, Categorical):
if not is_dtype_equal(self, value):
if not is_dtype_equal(self.dtype, value.dtype):
raise ValueError(
"Cannot set a Categorical with another, "
"without identical categories"
)
# is_dtype_equal implies categories_match_up_to_permutation
new_codes = self._validate_listlike(value)
value = Categorical.from_codes(new_codes, dtype=self.dtype)

Expand Down Expand Up @@ -2120,7 +2121,7 @@ def equals(self, other: object) -> bool:
"""
if not isinstance(other, Categorical):
return False
elif self.is_dtype_equal(other):
elif self.categories_match_up_to_permutation(other):
other_codes = self._validate_listlike(other)
return np.array_equal(self._codes, other_codes)
return False
Expand All @@ -2133,7 +2134,7 @@ def _concat_same_type(self, to_concat):

# ------------------------------------------------------------------

def is_dtype_equal(self, other):
def categories_match_up_to_permutation(self, other: "Categorical") -> bool:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umm, why this name? what is wrong with the existing one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is wrong with the existing one?

is_dtype_equal(self, other) means something different from self.is_dtype_equal(other) which i find confusing

why this name?

it accurately describes what is being checked

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't have a problem with this as a private method, and its ok to deprecate the .is_dtype_equal but we shouldn't offer this as a replacement.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

privatized+green

"""
Returns True if categoricals are the same dtype
same categories, and same ordered
Expand All @@ -2146,8 +2147,17 @@ def is_dtype_equal(self, other):
-------
bool
"""
return hash(self.dtype) == hash(other.dtype)

def is_dtype_equal(self, other) -> bool:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should deprecate any is_dtype_equal methods on any other Array/Index (not sure if we have any)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's just a CategoricalDtype.is_dtype_equal left. ill be happy to see that go

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

woops, no, got rid of that a while ago

warn(
"Categorical.is_dtype_equal is deprecated and will be removed "
"in a future version, use categories_match_up_to_permutation instead",
FutureWarning,
stacklevel=2,
)
try:
return hash(self.dtype) == hash(other.dtype)
return self.categories_match_up_to_permutation(other)
except (AttributeError, TypeError):
return False

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def _maybe_unwrap(x):
raise TypeError("dtype of categories must be the same")

ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
if all(first.categories_match_up_to_permutation(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def _is_dtype_compat(self, other) -> Categorical:
"""
if is_categorical_dtype(other):
other = extract_array(other)
if not other.is_dtype_equal(self):
if not other.categories_match_up_to_permutation(self):
raise TypeError(
"categories must match existing categories when appending"
)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,7 @@ def _maybe_coerce_merge_keys(self):
# if either left or right is a categorical
# then the must match exactly in categories & ordered
if lk_is_cat and rk_is_cat:
if lk.is_dtype_equal(rk):
if lk.categories_match_up_to_permutation(rk):
continue

elif lk_is_cat or rk_is_cat:
Expand Down
45 changes: 28 additions & 17 deletions pandas/tests/arrays/categorical/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,45 @@


class TestCategoricalDtypes:
def test_is_equal_dtype(self):
def test_is_dtype_equal_deprecated(self):
# GH#37545
c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)

with tm.assert_produces_warning(FutureWarning):
c1.is_dtype_equal(c1)

def test_categories_match_up_to_permutation(self):

# test dtype comparisons between cats

c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False)
c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True)
assert c1.is_dtype_equal(c1)
assert c2.is_dtype_equal(c2)
assert c3.is_dtype_equal(c3)
assert c1.is_dtype_equal(c2)
assert not c1.is_dtype_equal(c3)
assert not c1.is_dtype_equal(Index(list("aabca")))
assert not c1.is_dtype_equal(c1.astype(object))
assert c1.is_dtype_equal(CategoricalIndex(c1))
assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab")))
assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
assert c1.categories_match_up_to_permutation(c1)
assert c2.categories_match_up_to_permutation(c2)
assert c3.categories_match_up_to_permutation(c3)
assert c1.categories_match_up_to_permutation(c2)
assert not c1.categories_match_up_to_permutation(c3)
assert not c1.categories_match_up_to_permutation(Index(list("aabca")))
assert not c1.categories_match_up_to_permutation(c1.astype(object))
assert c1.categories_match_up_to_permutation(CategoricalIndex(c1))
assert c1.categories_match_up_to_permutation(
CategoricalIndex(c1, categories=list("cab"))
)
assert not c1.categories_match_up_to_permutation(
CategoricalIndex(c1, ordered=True)
)

# GH 16659
s1 = Series(c1)
s2 = Series(c2)
s3 = Series(c3)
assert c1.is_dtype_equal(s1)
assert c2.is_dtype_equal(s2)
assert c3.is_dtype_equal(s3)
assert c1.is_dtype_equal(s2)
assert not c1.is_dtype_equal(s3)
assert not c1.is_dtype_equal(s1.astype(object))
assert c1.categories_match_up_to_permutation(s1)
assert c2.categories_match_up_to_permutation(s2)
assert c3.categories_match_up_to_permutation(s3)
assert c1.categories_match_up_to_permutation(s2)
assert not c1.categories_match_up_to_permutation(s3)
assert not c1.categories_match_up_to_permutation(s1.astype(object))

def test_set_dtype_same(self):
c = Categorical(["a", "b", "c"])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,8 +1707,8 @@ def test_other_columns(self, left, right):
tm.assert_series_equal(result, expected)

# categories are preserved
assert left.X.values.is_dtype_equal(merged.X.values)
assert right.Z.values.is_dtype_equal(merged.Z.values)
assert left.X.values.categories_match_up_to_permutation(merged.X.values)
assert right.Z.values.categories_match_up_to_permutation(merged.Z.values)

@pytest.mark.parametrize(
"change",
Expand All @@ -1725,7 +1725,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
X = change(right.X.astype("object"))
right = right.assign(X=X)
assert is_categorical_dtype(left.X.values.dtype)
# assert not left.X.values.is_dtype_equal(right.X.values)
# assert not left.X.values.categories_match_up_to_permutation(right.X.values)

merged = pd.merge(left, right, on="X", how=join_type)

Expand Down