From 6fe0c3e870b2bdfc197d1768b26755125e6a568c Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 31 Oct 2020 14:17:52 -0700 Subject: [PATCH 1/5] REF: Categorical.is_dtype_equal -> categories_match_up_to_permutation --- pandas/core/arrays/categorical.py | 22 +++++++--- pandas/core/dtypes/concat.py | 2 +- pandas/core/indexes/category.py | 3 +- pandas/core/reshape/merge.py | 2 +- .../tests/arrays/categorical/test_dtypes.py | 44 ++++++++++++------- pandas/tests/reshape/merge/test_merge.py | 6 +-- 6 files changed, 50 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 499bb364c48a1..1333718d71f35 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -78,7 +78,7 @@ def func(self, other): # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." - if not self.is_dtype_equal(other): + if not self.categories_match_up_to_permutation(other): raise TypeError(msg) if not self.ordered and not self.categories.equals(other.categories): @@ -1709,7 +1709,7 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray: if self.categories.equals(target.categories): # We use the same codes, so can go directly to the engine codes = target.codes - elif self.is_dtype_equal(target): + elif self.categories_match_up_to_permutation(target): # We have the same categories up to a reshuffling of codes. codes = recode_for_categories( target.codes, target.categories, self.categories @@ -1882,11 +1882,12 @@ def _validate_setitem_value(self, value): # require identical categories set if isinstance(value, Categorical): - if not is_dtype_equal(self, value): + if not is_dtype_equal(self.dtype, value.dtype): raise ValueError( "Cannot set a Categorical with another, " "without identical categories" ) + # is_dtype_equal implies categories_match_up_to_permutation new_codes = self._validate_listlike(value) value = Categorical.from_codes(new_codes, dtype=self.dtype) @@ -2120,7 +2121,7 @@ def equals(self, other: object) -> bool: """ if not isinstance(other, Categorical): return False - elif self.is_dtype_equal(other): + elif self.categories_match_up_to_permutation(other): other_codes = self._validate_listlike(other) return np.array_equal(self._codes, other_codes) return False @@ -2133,7 +2134,7 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ - def is_dtype_equal(self, other): + def categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype same categories, and same ordered @@ -2146,8 +2147,17 @@ def is_dtype_equal(self, other): ------- bool """ + return hash(self.dtype) == hash(other.dtype) + + def is_dtype_equal(self, other) -> bool: + warn( + "Categorical.is_dtype_equal is deprecated and will be removed " + "in a future version, use categories_match_up_to_permutation instead", + FutureWarning, + stacklevel=2, + ) try: - return hash(self.dtype) == hash(other.dtype) + return self.categories_match_up_to_permutation(other) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 60fd959701821..eb95aa74a7854 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -296,7 +296,7 @@ def _maybe_unwrap(x): raise TypeError("dtype of categories must be the same") ordered = False - if all(first.is_dtype_equal(other) for other in to_union[1:]): + if all(first.categories_match_up_to_permutation(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index fb0e710921a5f..de7d6edf54da6 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -46,6 +46,7 @@ "_reverse_indexer", "searchsorted", "is_dtype_equal", + "categories_match_up_to_permutation", "min", "max", ], @@ -256,7 +257,7 @@ def _is_dtype_compat(self, other) -> Categorical: """ if is_categorical_dtype(other): other = extract_array(other) - if not other.is_dtype_equal(self): + if not other.categories_match_up_to_permutation(self): raise TypeError( "categories must match existing categories when appending" ) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5012be593820e..e7315aafe5a0b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1083,7 +1083,7 @@ def _maybe_coerce_merge_keys(self): # if either left or right is a categorical # then the must match exactly in categories & ordered if lk_is_cat and rk_is_cat: - if lk.is_dtype_equal(rk): + if lk.categories_match_up_to_permutation(rk): continue elif lk_is_cat or rk_is_cat: diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 47ce9cb4089f9..32f2d69864032 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -8,34 +8,44 @@ class TestCategoricalDtypes: - def test_is_equal_dtype(self): + def test_is_dtype_equal_deprecated(self): + c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) + + with tm.assert_produces_warning(FutureWarning): + c1.is_dtype_equal(c1) + + def test_categories_match_up_to_permutation(self): # test dtype comparisons between cats c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) - assert c1.is_dtype_equal(c1) - assert c2.is_dtype_equal(c2) - assert c3.is_dtype_equal(c3) - assert c1.is_dtype_equal(c2) - assert not c1.is_dtype_equal(c3) - assert not c1.is_dtype_equal(Index(list("aabca"))) - assert not c1.is_dtype_equal(c1.astype(object)) - assert c1.is_dtype_equal(CategoricalIndex(c1)) - assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab"))) - assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)) + assert c1.categories_match_up_to_permutation(c1) + assert c2.categories_match_up_to_permutation(c2) + assert c3.categories_match_up_to_permutation(c3) + assert c1.categories_match_up_to_permutation(c2) + assert not c1.categories_match_up_to_permutation(c3) + assert not c1.categories_match_up_to_permutation(Index(list("aabca"))) + assert not c1.categories_match_up_to_permutation(c1.astype(object)) + assert c1.categories_match_up_to_permutation(CategoricalIndex(c1)) + assert c1.categories_match_up_to_permutation( + CategoricalIndex(c1, categories=list("cab")) + ) + assert not c1.categories_match_up_to_permutation( + CategoricalIndex(c1, ordered=True) + ) # GH 16659 s1 = Series(c1) s2 = Series(c2) s3 = Series(c3) - assert c1.is_dtype_equal(s1) - assert c2.is_dtype_equal(s2) - assert c3.is_dtype_equal(s3) - assert c1.is_dtype_equal(s2) - assert not c1.is_dtype_equal(s3) - assert not c1.is_dtype_equal(s1.astype(object)) + assert c1.categories_match_up_to_permutation(s1) + assert c2.categories_match_up_to_permutation(s2) + assert c3.categories_match_up_to_permutation(s3) + assert c1.categories_match_up_to_permutation(s2) + assert not c1.categories_match_up_to_permutation(s3) + assert not c1.categories_match_up_to_permutation(s1.astype(object)) def test_set_dtype_same(self): c = Categorical(["a", "b", "c"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index bb2860b88b288..53505253d9cdb 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1707,8 +1707,8 @@ def test_other_columns(self, left, right): tm.assert_series_equal(result, expected) # categories are preserved - assert left.X.values.is_dtype_equal(merged.X.values) - assert right.Z.values.is_dtype_equal(merged.Z.values) + assert left.X.values.categories_match_up_to_permutation(merged.X.values) + assert right.Z.values.categories_match_up_to_permutation(merged.Z.values) @pytest.mark.parametrize( "change", @@ -1725,7 +1725,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values.dtype) - # assert not left.X.values.is_dtype_equal(right.X.values) + # assert not left.X.values.categories_match_up_to_permutation(right.X.values) merged = pd.merge(left, right, on="X", how=join_type) From 27517a6a2ddd75b01ab6ab4235c65e49fbb95366 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 31 Oct 2020 14:20:30 -0700 Subject: [PATCH 2/5] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/category.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f9e6a86e4f02d..711c95832d71d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -329,6 +329,7 @@ Deprecations - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) - Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) +- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, use :meth:`Categorical.categories_match_up_to_permutation` instead (:issue:`??`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index de7d6edf54da6..6a400475e6437 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -46,7 +46,6 @@ "_reverse_indexer", "searchsorted", "is_dtype_equal", - "categories_match_up_to_permutation", "min", "max", ], From 2bd4416a79c4a0f45b4e0d0da58c9c7f991c7744 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 31 Oct 2020 14:21:57 -0700 Subject: [PATCH 3/5] GH ref --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/arrays/categorical/test_dtypes.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 711c95832d71d..fc7fbaba519b5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -329,7 +329,7 @@ Deprecations - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) - Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) -- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, use :meth:`Categorical.categories_match_up_to_permutation` instead (:issue:`??`) +- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, use :meth:`Categorical.categories_match_up_to_permutation` instead (:issue:`37545`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 32f2d69864032..05a9e27dc15b3 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -9,6 +9,7 @@ class TestCategoricalDtypes: def test_is_dtype_equal_deprecated(self): + # GH#37545 c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) with tm.assert_produces_warning(FutureWarning): From c153bbd55f52cc0b07135651469914152cd41d1a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 2 Nov 2020 06:33:29 -0800 Subject: [PATCH 4/5] privatize --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/categorical.py | 8 ++--- pandas/core/dtypes/concat.py | 2 +- pandas/core/indexes/category.py | 2 +- pandas/core/reshape/merge.py | 2 +- .../tests/arrays/categorical/test_dtypes.py | 32 +++++++++---------- pandas/tests/reshape/merge/test_merge.py | 6 ++-- 7 files changed, 27 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c09390ad88022..8a092cb6e36db 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -340,7 +340,7 @@ Deprecations - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`) -- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, use :meth:`Categorical.categories_match_up_to_permutation` instead (:issue:`37545`) +- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 009503c91569a..b5d69cc9065ff 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -78,7 +78,7 @@ def func(self, other): # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." - if not self.categories_match_up_to_permutation(other): + if not self._categories_match_up_to_permutation(other): raise TypeError(msg) if not self.ordered and not self.categories.equals(other.categories): @@ -2113,7 +2113,7 @@ def equals(self, other: object) -> bool: """ if not isinstance(other, Categorical): return False - elif self.categories_match_up_to_permutation(other): + elif self._categories_match_up_to_permutation(other): other_codes = self._validate_listlike(other) return np.array_equal(self._codes, other_codes) return False @@ -2144,12 +2144,12 @@ def categories_match_up_to_permutation(self, other: "Categorical") -> bool: def is_dtype_equal(self, other) -> bool: warn( "Categorical.is_dtype_equal is deprecated and will be removed " - "in a future version, use categories_match_up_to_permutation instead", + "in a future version", FutureWarning, stacklevel=2, ) try: - return self.categories_match_up_to_permutation(other) + return self._categories_match_up_to_permutation(other) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index eb95aa74a7854..99dc01ef421d1 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -296,7 +296,7 @@ def _maybe_unwrap(x): raise TypeError("dtype of categories must be the same") ordered = False - if all(first.categories_match_up_to_permutation(other) for other in to_union[1:]): + if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c59f8621037af..8cbd0d83c78d7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -255,7 +255,7 @@ def _is_dtype_compat(self, other) -> Categorical: """ if is_categorical_dtype(other): other = extract_array(other) - if not other.categories_match_up_to_permutation(self): + if not other._categories_match_up_to_permutation(self): raise TypeError( "categories must match existing categories when appending" ) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e7315aafe5a0b..d82b1474ff3e0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1083,7 +1083,7 @@ def _maybe_coerce_merge_keys(self): # if either left or right is a categorical # then the must match exactly in categories & ordered if lk_is_cat and rk_is_cat: - if lk.categories_match_up_to_permutation(rk): + if lk._categories_match_up_to_permutation(rk): continue elif lk_is_cat or rk_is_cat: diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 05a9e27dc15b3..deafa22a6e8eb 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -22,18 +22,18 @@ def test_categories_match_up_to_permutation(self): c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) - assert c1.categories_match_up_to_permutation(c1) - assert c2.categories_match_up_to_permutation(c2) - assert c3.categories_match_up_to_permutation(c3) - assert c1.categories_match_up_to_permutation(c2) - assert not c1.categories_match_up_to_permutation(c3) - assert not c1.categories_match_up_to_permutation(Index(list("aabca"))) - assert not c1.categories_match_up_to_permutation(c1.astype(object)) - assert c1.categories_match_up_to_permutation(CategoricalIndex(c1)) - assert c1.categories_match_up_to_permutation( + assert c1._categories_match_up_to_permutation(c1) + assert c2._categories_match_up_to_permutation(c2) + assert c3._categories_match_up_to_permutation(c3) + assert c1._categories_match_up_to_permutation(c2) + assert not c1._categories_match_up_to_permutation(c3) + assert not c1._categories_match_up_to_permutation(Index(list("aabca"))) + assert not c1._categories_match_up_to_permutation(c1.astype(object)) + assert c1._categories_match_up_to_permutation(CategoricalIndex(c1)) + assert c1._categories_match_up_to_permutation( CategoricalIndex(c1, categories=list("cab")) ) - assert not c1.categories_match_up_to_permutation( + assert not c1._categories_match_up_to_permutation( CategoricalIndex(c1, ordered=True) ) @@ -41,12 +41,12 @@ def test_categories_match_up_to_permutation(self): s1 = Series(c1) s2 = Series(c2) s3 = Series(c3) - assert c1.categories_match_up_to_permutation(s1) - assert c2.categories_match_up_to_permutation(s2) - assert c3.categories_match_up_to_permutation(s3) - assert c1.categories_match_up_to_permutation(s2) - assert not c1.categories_match_up_to_permutation(s3) - assert not c1.categories_match_up_to_permutation(s1.astype(object)) + assert c1._categories_match_up_to_permutation(s1) + assert c2._categories_match_up_to_permutation(s2) + assert c3._categories_match_up_to_permutation(s3) + assert c1._categories_match_up_to_permutation(s2) + assert not c1._categories_match_up_to_permutation(s3) + assert not c1._categories_match_up_to_permutation(s1.astype(object)) def test_set_dtype_same(self): c = Categorical(["a", "b", "c"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 53505253d9cdb..a58372040c7f3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1707,8 +1707,8 @@ def test_other_columns(self, left, right): tm.assert_series_equal(result, expected) # categories are preserved - assert left.X.values.categories_match_up_to_permutation(merged.X.values) - assert right.Z.values.categories_match_up_to_permutation(merged.Z.values) + assert left.X.values._categories_match_up_to_permutation(merged.X.values) + assert right.Z.values._categories_match_up_to_permutation(merged.Z.values) @pytest.mark.parametrize( "change", @@ -1725,7 +1725,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values.dtype) - # assert not left.X.values.categories_match_up_to_permutation(right.X.values) + # assert not left.X.values._categories_match_up_to_permutation(right.X.values) merged = pd.merge(left, right, on="X", how=join_type) From af35fd2ddb3026d1ce04dbd63d08e90886b58052 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 2 Nov 2020 07:39:00 -0800 Subject: [PATCH 5/5] fixup copy/paste --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b5d69cc9065ff..13fd0c5df1229 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2126,7 +2126,7 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ - def categories_match_up_to_permutation(self, other: "Categorical") -> bool: + def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype same categories, and same ordered