From 44929796567593d05d497896dce83c66e43c1f12 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Nov 2020 14:39:26 -0800 Subject: [PATCH] REF: Categorical.is_dtype_equal -> categories_match_up_to_permutation (#37545) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/categorical.py | 20 ++++++--- pandas/core/dtypes/concat.py | 2 +- pandas/core/indexes/category.py | 2 +- pandas/core/reshape/merge.py | 2 +- .../tests/arrays/categorical/test_dtypes.py | 45 ++++++++++++------- pandas/tests/reshape/merge/test_merge.py | 6 +-- 7 files changed, 50 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f137302d4994..8a092cb6e36db 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -340,6 +340,7 @@ Deprecations - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`) +- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 263512e427c69..b1f913e9ea641 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -78,7 +78,7 @@ def func(self, other): # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." - if not self.is_dtype_equal(other): + if not self._categories_match_up_to_permutation(other): raise TypeError(msg) if not self.ordered and not self.categories.equals(other.categories): @@ -1869,11 +1869,12 @@ def _validate_setitem_value(self, value): # require identical categories set if isinstance(value, Categorical): - if not is_dtype_equal(self, value): + if not is_dtype_equal(self.dtype, value.dtype): raise ValueError( "Cannot set a Categorical with another, " "without identical categories" ) + # is_dtype_equal implies categories_match_up_to_permutation new_codes = self._validate_listlike(value) value = Categorical.from_codes(new_codes, dtype=self.dtype) @@ -2107,7 +2108,7 @@ def equals(self, other: object) -> bool: """ if not isinstance(other, Categorical): return False - elif self.is_dtype_equal(other): + elif self._categories_match_up_to_permutation(other): other_codes = self._validate_listlike(other) return np.array_equal(self._codes, other_codes) return False @@ -2120,7 +2121,7 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ - def is_dtype_equal(self, other): + def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype same categories, and same ordered @@ -2133,8 +2134,17 @@ def is_dtype_equal(self, other): ------- bool """ + return hash(self.dtype) == hash(other.dtype) + + def is_dtype_equal(self, other) -> bool: + warn( + "Categorical.is_dtype_equal is deprecated and will be removed " + "in a future version", + FutureWarning, + stacklevel=2, + ) try: - return hash(self.dtype) == hash(other.dtype) + return self._categories_match_up_to_permutation(other) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 60fd959701821..99dc01ef421d1 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -296,7 +296,7 @@ def _maybe_unwrap(x): raise TypeError("dtype of categories must be the same") ordered = False - if all(first.is_dtype_equal(other) for other in to_union[1:]): + if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2f2836519d847..8cbd0d83c78d7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -255,7 +255,7 @@ def _is_dtype_compat(self, other) -> Categorical: """ if is_categorical_dtype(other): other = extract_array(other) - if not other.is_dtype_equal(self): + if not other._categories_match_up_to_permutation(self): raise TypeError( "categories must match existing categories when appending" ) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5012be593820e..d82b1474ff3e0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1083,7 +1083,7 @@ def _maybe_coerce_merge_keys(self): # if either left or right is a categorical # then the must match exactly in categories & ordered if lk_is_cat and rk_is_cat: - if lk.is_dtype_equal(rk): + if lk._categories_match_up_to_permutation(rk): continue elif lk_is_cat or rk_is_cat: diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 47ce9cb4089f9..deafa22a6e8eb 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -8,34 +8,45 @@ class TestCategoricalDtypes: - def test_is_equal_dtype(self): + def test_is_dtype_equal_deprecated(self): + # GH#37545 + c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) + + with tm.assert_produces_warning(FutureWarning): + c1.is_dtype_equal(c1) + + def test_categories_match_up_to_permutation(self): # test dtype comparisons between cats c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) - assert c1.is_dtype_equal(c1) - assert c2.is_dtype_equal(c2) - assert c3.is_dtype_equal(c3) - assert c1.is_dtype_equal(c2) - assert not c1.is_dtype_equal(c3) - assert not c1.is_dtype_equal(Index(list("aabca"))) - assert not c1.is_dtype_equal(c1.astype(object)) - assert c1.is_dtype_equal(CategoricalIndex(c1)) - assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab"))) - assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)) + assert c1._categories_match_up_to_permutation(c1) + assert c2._categories_match_up_to_permutation(c2) + assert c3._categories_match_up_to_permutation(c3) + assert c1._categories_match_up_to_permutation(c2) + assert not c1._categories_match_up_to_permutation(c3) + assert not c1._categories_match_up_to_permutation(Index(list("aabca"))) + assert not c1._categories_match_up_to_permutation(c1.astype(object)) + assert c1._categories_match_up_to_permutation(CategoricalIndex(c1)) + assert c1._categories_match_up_to_permutation( + CategoricalIndex(c1, categories=list("cab")) + ) + assert not c1._categories_match_up_to_permutation( + CategoricalIndex(c1, ordered=True) + ) # GH 16659 s1 = Series(c1) s2 = Series(c2) s3 = Series(c3) - assert c1.is_dtype_equal(s1) - assert c2.is_dtype_equal(s2) - assert c3.is_dtype_equal(s3) - assert c1.is_dtype_equal(s2) - assert not c1.is_dtype_equal(s3) - assert not c1.is_dtype_equal(s1.astype(object)) + assert c1._categories_match_up_to_permutation(s1) + assert c2._categories_match_up_to_permutation(s2) + assert c3._categories_match_up_to_permutation(s3) + assert c1._categories_match_up_to_permutation(s2) + assert not c1._categories_match_up_to_permutation(s3) + assert not c1._categories_match_up_to_permutation(s1.astype(object)) def test_set_dtype_same(self): c = Categorical(["a", "b", "c"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index bb2860b88b288..a58372040c7f3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1707,8 +1707,8 @@ def test_other_columns(self, left, right): tm.assert_series_equal(result, expected) # categories are preserved - assert left.X.values.is_dtype_equal(merged.X.values) - assert right.Z.values.is_dtype_equal(merged.Z.values) + assert left.X.values._categories_match_up_to_permutation(merged.X.values) + assert right.Z.values._categories_match_up_to_permutation(merged.Z.values) @pytest.mark.parametrize( "change", @@ -1725,7 +1725,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values.dtype) - # assert not left.X.values.is_dtype_equal(right.X.values) + # assert not left.X.values._categories_match_up_to_permutation(right.X.values) merged = pd.merge(left, right, on="X", how=join_type)