Skip to content

Commit

Permalink
DEPR: Deprecate ordered=None for CategoricalDtype (#26403)
Browse files Browse the repository at this point in the history
  • Loading branch information
jschendel authored Jul 3, 2019
1 parent 7ab9ff5 commit 8393e37
Show file tree
Hide file tree
Showing 12 changed files with 163 additions and 48 deletions.
22 changes: 16 additions & 6 deletions doc/source/whatsnew/v0.23.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -935,13 +935,23 @@ In previous versions, the default value for the ``ordered`` parameter was ``Fals

New behavior:

.. ipython:: python
.. code-block:: ipython
from pandas.api.types import CategoricalDtype
cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
cat
cdt = CategoricalDtype(categories=list('cbad'))
cat.astype(cdt)
In [2]: from pandas.api.types import CategoricalDtype
In [3]: cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
In [4]: cat
Out[4]:
[a, b, c, a, b, a]
Categories (3, object): [c < b < a]
In [5]: cdt = CategoricalDtype(categories=list('cbad'))
In [6]: cat.astype(cdt)
Out[6]:
[a, b, c, a, b, a]
Categories (4, object): [c < b < a < d]
Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``.

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,7 @@ Other deprecations
- :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`)
- :meth:`Series.put` is deprecated. (:issue:`18262`)
- :meth:`Index.item` and :meth:`Series.item` is deprecated. (:issue:`18262`)
- The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`)
- :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`).
.. _whatsnew_0250.prior_deprecations:
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
# sanitize input
if is_categorical_dtype(values):
if dtype.categories is None:
dtype = CategoricalDtype(values.categories, dtype.ordered)
dtype = CategoricalDtype(values.categories, dtype._ordered)
elif not isinstance(values, (ABCIndexClass, ABCSeries)):
# sanitize_array coerces np.nan to a string under certain versions
# of numpy
Expand All @@ -355,7 +355,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
codes, categories = factorize(values, sort=True)
except TypeError:
codes, categories = factorize(values, sort=False)
if dtype.ordered:
if dtype._ordered:
# raise, as we don't have a sortable data structure and so
# the user should give us one by specifying categories
raise TypeError("'values' is not ordered, please "
Expand All @@ -368,7 +368,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
"supported at this time")

# we're inferring from values
dtype = CategoricalDtype(categories, dtype.ordered)
dtype = CategoricalDtype(categories, dtype._ordered)

elif is_categorical_dtype(values):
old_codes = (values._values.codes if isinstance(values, ABCSeries)
Expand Down Expand Up @@ -433,7 +433,7 @@ def ordered(self):
"""
Whether the categories have an ordered relationship.
"""
return self.dtype.ordered
return self.dtype._ordered

@property
def dtype(self) -> CategoricalDtype:
Expand Down Expand Up @@ -847,7 +847,7 @@ def set_categories(self, new_categories, ordered=None, rename=False,
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if ordered is None:
ordered = self.dtype.ordered
ordered = self.dtype._ordered
new_dtype = CategoricalDtype(new_categories, ordered=ordered)

cat = self if inplace else self.copy()
Expand Down
62 changes: 44 additions & 18 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@

str_type = str

# GH26403: sentinel value used for the default value of ordered in the
# CategoricalDtype constructor to detect when ordered=None is explicitly passed
ordered_sentinel = object() # type: object

# TODO(GH26403): Replace with Optional[bool] or bool
OrderedType = Union[None, bool, object]


def register_extension_dtype(cls: Type[ExtensionDtype],
) -> Type[ExtensionDtype]:
Expand Down Expand Up @@ -214,7 +221,9 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
_metadata = ('categories', 'ordered')
_cache = {} # type: Dict[str_type, PandasExtensionDtype]

def __init__(self, categories=None, ordered: Optional[bool] = None):
def __init__(self,
categories=None,
ordered: OrderedType = ordered_sentinel):
self._finalize(categories, ordered, fastpath=False)

@classmethod
Expand All @@ -230,7 +239,7 @@ def _from_fastpath(cls,
def _from_categorical_dtype(cls,
dtype: 'CategoricalDtype',
categories=None,
ordered: Optional[bool] = None,
ordered: OrderedType = None,
) -> 'CategoricalDtype':
if categories is ordered is None:
return dtype
Expand Down Expand Up @@ -330,19 +339,20 @@ def _from_values_or_dtype(cls,

def _finalize(self,
categories,
ordered: Optional[bool],
ordered: OrderedType,
fastpath: bool = False,
) -> None:

if ordered is not None:
if ordered is not None and ordered is not ordered_sentinel:
self.validate_ordered(ordered)

if categories is not None:
categories = self.validate_categories(categories,
fastpath=fastpath)

self._categories = categories
self._ordered = ordered
self._ordered = ordered if ordered is not ordered_sentinel else None
self._ordered_from_sentinel = ordered is ordered_sentinel

def __setstate__(self, state: Dict[str_type, Any]) -> None:
# for pickle compat. __get_state__ is defined in the
Expand All @@ -355,12 +365,12 @@ def __hash__(self) -> int:
# _hash_categories returns a uint64, so use the negative
# space for when we have unknown categories to avoid a conflict
if self.categories is None:
if self.ordered:
if self._ordered:
return -1
else:
return -2
# We *do* want to include the real self.ordered here
return int(self._hash_categories(self.categories, self.ordered))
return int(self._hash_categories(self.categories, self._ordered))

def __eq__(self, other: Any) -> bool:
"""
Expand All @@ -379,7 +389,7 @@ def __eq__(self, other: Any) -> bool:
return other == self.name
elif other is self:
return True
elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')):
return False
elif self.categories is None or other.categories is None:
# We're forced into a suboptimal corner thanks to math and
Expand All @@ -388,10 +398,10 @@ def __eq__(self, other: Any) -> bool:
# CDT(., .) = CDT(None, False) and *all*
# CDT(., .) = CDT(None, True).
return True
elif self.ordered or other.ordered:
elif self._ordered or other._ordered:
# At least one has ordered=True; equal if both have ordered=True
# and the same values for categories in the same order.
return ((self.ordered == other.ordered) and
return ((self._ordered == other._ordered) and
self.categories.equals(other.categories))
else:
# Neither has ordered=True; equal if both have the same categories,
Expand All @@ -406,10 +416,10 @@ def __repr__(self):
data = "None, "
else:
data = self.categories._format_data(name=self.__class__.__name__)
return tpl.format(data, self.ordered)
return tpl.format(data, self._ordered)

@staticmethod
def _hash_categories(categories, ordered: Optional[bool] = True) -> int:
def _hash_categories(categories, ordered: OrderedType = True) -> int:
from pandas.core.util.hashing import (
hash_array, _combine_hash_arrays, hash_tuples
)
Expand Down Expand Up @@ -459,7 +469,7 @@ def construct_array_type(cls):
return Categorical

@staticmethod
def validate_ordered(ordered: bool) -> None:
def validate_ordered(ordered: OrderedType) -> None:
"""
Validates that we have a valid ordered parameter. If
it is not a boolean, a TypeError will be raised.
Expand Down Expand Up @@ -534,17 +544,25 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype':
msg = ('a CategoricalDtype must be passed to perform an update, '
'got {dtype!r}').format(dtype=dtype)
raise ValueError(msg)
elif dtype.categories is not None and dtype.ordered is self.ordered:
return dtype

# dtype is CDT: keep current categories/ordered if None
new_categories = dtype.categories
if new_categories is None:
new_categories = self.categories

new_ordered = dtype.ordered
new_ordered = dtype._ordered
new_ordered_from_sentinel = dtype._ordered_from_sentinel
if new_ordered is None:
new_ordered = self.ordered
# maintain existing ordered if new dtype has ordered=None
new_ordered = self._ordered
if self._ordered and new_ordered_from_sentinel:
# only warn if we'd actually change the existing behavior
msg = ("Constructing a CategoricalDtype without specifying "
"`ordered` will default to `ordered=False` in a future "
"version, which will cause the resulting categorical's "
"`ordered` attribute to change to False; `ordered=True`"
" must be explicitly passed in order to be retained")
warnings.warn(msg, FutureWarning, stacklevel=3)

return CategoricalDtype(new_categories, new_ordered)

Expand All @@ -556,10 +574,18 @@ def categories(self):
return self._categories

@property
def ordered(self) -> Optional[bool]:
def ordered(self) -> OrderedType:
"""
Whether the categories have an ordered relationship.
"""
# TODO: remove if block when ordered=None as default is deprecated
if self._ordered_from_sentinel and self._ordered is None:
# warn when accessing ordered if ordered=None and None was not
# explicitly passed to the constructor
msg = ("Constructing a CategoricalDtype without specifying "
"`ordered` will default to `ordered=False` in a future "
"version; `ordered=None` must be explicitly passed.")
warnings.warn(msg, FutureWarning, stacklevel=2)
return self._ordered

@property
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure):
# We *do* allow casting to categorical, since we know
# that Categorical is the only array type for 'category'.
subarr = Categorical(arr, dtype.categories,
ordered=dtype.ordered)
ordered=dtype._ordered)
elif is_extension_array_dtype(dtype):
# create an extension array from its dtype
array_type = dtype.construct_array_type()._from_sequence
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
_is_unorderable_exception, ensure_platform_int, is_bool,
_is_unorderable_exception, ensure_platform_int, is_bool, is_categorical,
is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
Expand Down Expand Up @@ -170,6 +170,12 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
if data is None:
data = {}
if dtype is not None:
# GH 26336: explicitly handle 'category' to avoid warning
# TODO: Remove after CategoricalDtype defaults to ordered=False
if (isinstance(dtype, str) and dtype == 'category' and
is_categorical(data)):
dtype = data.dtype

dtype = self._validate_dtype(dtype)

if isinstance(data, MultiIndex):
Expand Down
9 changes: 2 additions & 7 deletions pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,14 +618,9 @@ def decode(obj):
return Interval(obj['left'], obj['right'], obj['closed'])
elif typ == 'series':
dtype = dtype_for(obj['dtype'])
pd_dtype = pandas_dtype(dtype)

index = obj['index']
result = Series(unconvert(obj['data'], dtype, obj['compress']),
index=index,
dtype=pd_dtype,
name=obj['name'])
return result
data = unconvert(obj['data'], dtype, obj['compress'])
return Series(data, index=index, dtype=dtype, name=obj['name'])

elif typ == 'block_manager':
axes = obj['axes']
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/arrays/categorical/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ def test_astype_category(self, dtype_ordered, cat_ordered):
expected = cat
tm.assert_categorical_equal(result, expected)

def test_astype_category_ordered_none_deprecated(self):
# GH 26336
cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True)
cdt2 = CategoricalDtype(categories=list('cedafb'))
cat = Categorical(list('abcdaba'), dtype=cdt1)
with tm.assert_produces_warning(FutureWarning):
cat.astype(cdt2)

def test_iter_python_types(self):
# GH-19909
cat = Categorical([1, 2])
Expand Down
33 changes: 26 additions & 7 deletions pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
is_datetime64tz_dtype, is_datetimetz, is_dtype_equal, is_interval_dtype,
is_period, is_period_dtype, is_string_dtype)
from pandas.core.dtypes.dtypes import (
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry)
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype,
ordered_sentinel, registry)

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -54,7 +55,8 @@ def test_pickle(self):
class TestCategoricalDtype(Base):

def create(self):
return CategoricalDtype()
# TODO(GH 26403): Remove when default ordered becomes False
return CategoricalDtype(ordered=None)

def test_pickle(self):
# make sure our cache is NOT pickled
Expand Down Expand Up @@ -675,7 +677,8 @@ def test_unordered_same(self, ordered):
def test_categories(self):
result = CategoricalDtype(['a', 'b', 'c'])
tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
assert result.ordered is None
with tm.assert_produces_warning(FutureWarning):
assert result.ordered is None

def test_equal_but_different(self, ordered_fixture):
c1 = CategoricalDtype([1, 2, 3])
Expand Down Expand Up @@ -804,7 +807,8 @@ def test_categorical_categories(self):

@pytest.mark.parametrize('new_categories', [
list('abc'), list('cba'), list('wxyz'), None])
@pytest.mark.parametrize('new_ordered', [True, False, None])
@pytest.mark.parametrize('new_ordered', [
True, False, None, ordered_sentinel])
def test_update_dtype(self, ordered_fixture, new_categories, new_ordered):
dtype = CategoricalDtype(list('abc'), ordered_fixture)
new_dtype = CategoricalDtype(new_categories, new_ordered)
Expand All @@ -813,11 +817,18 @@ def test_update_dtype(self, ordered_fixture, new_categories, new_ordered):
if expected_categories is None:
expected_categories = dtype.categories

expected_ordered = new_dtype.ordered
if expected_ordered is None:
expected_ordered = new_ordered
if new_ordered is ordered_sentinel or new_ordered is None:
expected_ordered = dtype.ordered

result = dtype.update_dtype(new_dtype)
# GH 26336
if new_ordered is ordered_sentinel and ordered_fixture is True:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = dtype.update_dtype(new_dtype)
else:
result = dtype.update_dtype(new_dtype)

tm.assert_index_equal(result.categories, expected_categories)
assert result.ordered is expected_ordered

Expand All @@ -837,6 +848,14 @@ def test_update_dtype_errors(self, bad_dtype):
with pytest.raises(ValueError, match=msg):
dtype.update_dtype(bad_dtype)

@pytest.mark.parametrize('ordered', [ordered_sentinel, None, True, False])
def test_ordered_none_default_deprecated(self, ordered):
# GH 26403: CDT.ordered only warns if ordered is not explicitly passed
dtype = CategoricalDtype(list('abc'), ordered=ordered)
warning = FutureWarning if ordered is ordered_sentinel else None
with tm.assert_produces_warning(warning):
dtype.ordered


@pytest.mark.parametrize('dtype', [
CategoricalDtype,
Expand Down
Loading

0 comments on commit 8393e37

Please sign in to comment.