From 6fd24d897aaa584cd821813b5b3bcfbaf7b10cb4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jan 2018 09:42:54 -0600 Subject: [PATCH] REF: Created pandas.core.arrays Moved pandas.core.categorical to arrays. --- pandas/_libs/parsers.pyx | 2 +- pandas/compat/pickle_compat.py | 6 +- pandas/core/api.py | 2 +- pandas/core/arrays/__init__.py | 1 + pandas/core/arrays/categorical.py | 2331 +++++++++++++++++++++++++ pandas/core/categorical.py | 2334 +------------------------- pandas/core/dtypes/concat.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/groupby.py | 2 +- pandas/core/indexes/category.py | 8 +- pandas/core/indexes/multi.py | 6 +- pandas/core/internals.py | 2 +- pandas/core/reshape/concat.py | 4 +- pandas/core/reshape/melt.py | 2 +- pandas/core/reshape/reshape.py | 3 +- pandas/core/series.py | 2 +- pandas/core/sorting.py | 2 +- pandas/io/parsers.py | 2 +- pandas/io/pytables.py | 3 +- pandas/io/stata.py | 2 +- pandas/tests/api/test_api.py | 7 + pandas/tests/categorical/test_api.py | 2 +- pandas/tests/series/test_api.py | 3 +- 23 files changed, 2375 insertions(+), 2355 deletions(-) create mode 100644 pandas/core/arrays/__init__.py create mode 100644 pandas/core/arrays/categorical.py diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cf63b5083885ef..75cf0a88e37c1a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -55,7 +55,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_object_dtype, is_datetime64_dtype, pandas_dtype) -from pandas.core.categorical import Categorical +from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as com diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 07b34961ce25d7..f651fbbf563165 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -108,7 +108,11 @@ def load_reduce(self): ('pandas.tseries.index', 'DatetimeIndex'): ('pandas.core.indexes.datetimes', 'DatetimeIndex'), ('pandas.tseries.period', 'PeriodIndex'): - ('pandas.core.indexes.period', 'PeriodIndex') + ('pandas.core.indexes.period', 'PeriodIndex'), + + # 19269, arrays moving + ('pandas.core.categorical', 'Categorical'): + ('pandas.core.arrays', 'Categorical'), } diff --git a/pandas/core/api.py b/pandas/core/api.py index b228a97c990747..aa37ddffa11564 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -6,7 +6,7 @@ from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.dtypes.missing import isna, isnull, notna, notnull -from pandas.core.categorical import Categorical +from pandas.core.arrays import Categorical from pandas.core.groupby import Grouper from pandas.io.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py new file mode 100644 index 00000000000000..ee32b12f0e7121 --- /dev/null +++ b/pandas/core/arrays/__init__.py @@ -0,0 +1 @@ +from .categorical import Categorical # noqa diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py new file mode 100644 index 00000000000000..708f903cd73cb4 --- /dev/null +++ b/pandas/core/arrays/categorical.py @@ -0,0 +1,2331 @@ +# pylint: disable=E1101,W0232 + +import numpy as np +from warnings import warn +import types + +from pandas import compat +from pandas.compat import u, lzip +from pandas._libs import lib, algos as libalgos + +from pandas.core.dtypes.generic import ( + 
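# Editor's illustrative sketch (not part of the upstream diff): after this
# refactor the class lives in pandas.core.arrays, while the public
# pd.Categorical entry point keeps working unchanged. Assumes a pandas build
# that includes this move.
import pandas as pd
from pandas.core.arrays import Categorical

assert Categorical is pd.Categorical          # same class, new home
cat = Categorical(['a', 'b', 'a'])
print(type(cat).__module__)                   # pandas.core.arrays.categorical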
ABCSeries, ABCIndexClass, ABCCategoricalIndex) +from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.cast import ( + maybe_infer_to_datetimelike, + coerce_indexer_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.common import ( + _ensure_int64, + _ensure_object, + _ensure_platform_int, + is_dtype_equal, + is_datetimelike, + is_datetime64_dtype, + is_timedelta64_dtype, + is_categorical, + is_categorical_dtype, + is_list_like, is_sequence, + is_scalar, + is_dict_like) +from pandas.core.common import is_null_slice, _maybe_box_datetimelike + +from pandas.core.algorithms import factorize, take_1d, unique1d +from pandas.core.accessor import PandasDelegate +from pandas.core.base import (PandasObject, + NoNewAttributesMixin, _shared_docs) +import pandas.core.common as com +from pandas.core.missing import interpolate_2d +from pandas.compat.numpy import function as nv +from pandas.util._decorators import ( + Appender, cache_readonly, deprecate_kwarg, Substitution) + +from pandas.io.formats.terminal import get_terminal_size +from pandas.util._validators import validate_bool_kwarg +from pandas.core.config import get_option + + +def _cat_compare_op(op): + def f(self, other): + # On python2, you can usually compare any type to any type, and + # Categoricals can be seen as a custom type, but having different + # results depending whether categories are the same or not is kind of + # insane, so be a bit stricter here and use the python3 idea of + # comparing only things of equal type. + if not self.ordered: + if op in ['__lt__', '__gt__', '__le__', '__ge__']: + raise TypeError("Unordered Categoricals can only compare " + "equality or not") + if isinstance(other, Categorical): + # Two Categoricals can only be be compared if the categories are + # the same (maybe up to ordering, depending on ordered) + + msg = ("Categoricals can only be compared if " + "'categories' are the same.") + if len(self.categories) != len(other.categories): + raise TypeError(msg + " Categories are different lengths") + elif (self.ordered and not (self.categories == + other.categories).all()): + raise TypeError(msg) + elif not set(self.categories) == set(other.categories): + raise TypeError(msg) + + if not (self.ordered == other.ordered): + raise TypeError("Categoricals can only be compared if " + "'ordered' is the same") + if not self.ordered and not self.categories.equals( + other.categories): + # both unordered and different order + other_codes = _get_codes_for_values(other, self.categories) + else: + other_codes = other._codes + + na_mask = (self._codes == -1) | (other_codes == -1) + f = getattr(self._codes, op) + ret = f(other_codes) + if na_mask.any(): + # In other series, the leads to False, so do that here too + ret[na_mask] = False + return ret + + # Numpy-1.9 and earlier may convert a scalar to a zerodim array during + # comparison operation when second arg has higher priority, e.g. + # + # cat[0] < cat + # + # With cat[0], for example, being ``np.int64(1)`` by the time it gets + # into this function would become ``np.array(1)``. 
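# Illustrative sketch of the comparison rules implemented by _cat_compare_op
# (editor's example, not from the diff): two Categoricals need matching
# categories, missing values compare False, and unordered categoricals only
# support ==/!=.
import pandas as pd

a = pd.Categorical(['a', 'b', None], categories=['a', 'b'])
b = pd.Categorical(['a', 'a', 'b'], categories=['a', 'b'])

print(a == b)        # [ True False False]  -- the NaN slot compares False
try:
    a < b            # unordered: ordering comparisons raise
except TypeError as exc:
    print(exc)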
+ other = lib.item_from_zerodim(other) + if is_scalar(other): + if other in self.categories: + i = self.categories.get_loc(other) + return getattr(self._codes, op)(i) + else: + if op == '__eq__': + return np.repeat(False, len(self)) + elif op == '__ne__': + return np.repeat(True, len(self)) + else: + msg = ("Cannot compare a Categorical for op {op} with a " + "scalar, which is not a category.") + raise TypeError(msg.format(op=op)) + else: + + # allow categorical vs object dtype array comparisons for equality + # these are only positional comparisons + if op in ['__eq__', '__ne__']: + return getattr(np.array(self), op)(np.array(other)) + + msg = ("Cannot compare a Categorical for op {op} with type {typ}." + "\nIf you want to compare values, use 'np.asarray(cat) " + " other'.") + raise TypeError(msg.format(op=op, typ=type(other))) + + f.__name__ = op + + return f + + +def _maybe_to_categorical(array): + """ + Coerce to a categorical if a series is given. + + Internal use ONLY. + """ + if isinstance(array, (ABCSeries, ABCCategoricalIndex)): + return array._values + elif isinstance(array, np.ndarray): + return Categorical(array) + return array + + +_codes_doc = """The category codes of this categorical. + +Level codes are an array if integer which are the positions of the real +values in the categories array. + +There is not setter, use the other categorical methods and the normal item +setter to change values in the categorical. +""" + + +class Categorical(PandasObject): + """ + Represents a categorical variable in classic R / S-plus fashion + + `Categoricals` can only take on only a limited, and usually fixed, number + of possible values (`categories`). In contrast to statistical categorical + variables, a `Categorical` might have an order, but numerical operations + (additions, divisions, ...) are not possible. + + All values of the `Categorical` are either in `categories` or `np.nan`. + Assigning values outside of `categories` will raise a `ValueError`. Order + is defined by the order of the `categories`, not lexical order of the + values. + + Parameters + ---------- + values : list-like + The values of the categorical. If categories are given, values not in + categories will be replaced with NaN. + categories : Index-like (unique), optional + The unique categories for this categorical. If not given, the + categories are assumed to be the unique values of values. + ordered : boolean, (default False) + Whether or not this categorical is treated as a ordered categorical. + If not given, the resulting categorical will not be ordered. + dtype : CategoricalDtype + An instance of ``CategoricalDtype`` to use for this categorical + + .. versionadded:: 0.21.0 + + Attributes + ---------- + categories : Index + The categories of this categorical + codes : ndarray + The codes (integer positions, which point to the categories) of this + categorical, read only. + ordered : boolean + Whether or not this Categorical is ordered. + dtype : CategoricalDtype + The instance of ``CategoricalDtype`` storing the ``categories`` + and ``ordered``. + + .. versionadded:: 0.21.0 + + Methods + ------- + from_codes + __array__ + + Raises + ------ + ValueError + If the categories do not validate. + TypeError + If an explicit ``ordered=True`` is given but no `categories` and the + `values` are not sortable. 
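# Illustrative sketch of the scalar branch above (editor's example only):
# a scalar that is a category is compared via its code, while a non-category
# scalar is only allowed for ==/!=.
import pandas as pd

cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
print(cat == 'a')    # [ True False  True]
print(cat == 'z')    # [False False False]  -- 'z' is not a category
print(cat != 'z')    # [ True  True  True]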
+ + Examples + -------- + >>> pd.Categorical([1, 2, 3, 1, 2, 3]) + [1, 2, 3, 1, 2, 3] + Categories (3, int64): [1, 2, 3] + + >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + [a, b, c, a, b, c] + Categories (3, object): [a, b, c] + + Ordered `Categoricals` can be sorted according to the custom order + of the categories and can have a min and max value. + + >>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True, + ... categories=['c', 'b', 'a']) + >>> c + [a, b, c, a, b, c] + Categories (3, object): [c < b < a] + >>> c.min() + 'c' + + Notes + ----- + See the `user guide + `_ for more. + + See also + -------- + pandas.api.types.CategoricalDtype : Type for categorical data + CategoricalIndex : An Index with an underlying ``Categorical`` + """ + + # For comparisons, so that numpy uses our implementation if the compare + # ops, which raise + __array_priority__ = 1000 + _dtype = CategoricalDtype() + _deprecations = frozenset(['labels']) + _typ = 'categorical' + + def __init__(self, values, categories=None, ordered=None, dtype=None, + fastpath=False): + + # Ways of specifying the dtype (prioritized ordered) + # 1. dtype is a CategoricalDtype + # a.) with known categories, use dtype.categories + # b.) else with Categorical values, use values.dtype + # c.) else, infer from values + # d.) specifying dtype=CategoricalDtype and categories is an error + # 2. dtype is a string 'category' + # a.) use categories, ordered + # b.) use values.dtype + # c.) infer from values + # 3. dtype is None + # a.) use categories, ordered + # b.) use values.dtype + # c.) infer from values + + if dtype is not None: + # The dtype argument takes precedence over values.dtype (if any) + if isinstance(dtype, compat.string_types): + if dtype == 'category': + dtype = CategoricalDtype(categories, ordered) + else: + msg = "Unknown `dtype` {dtype}" + raise ValueError(msg.format(dtype=dtype)) + elif categories is not None or ordered is not None: + raise ValueError("Cannot specify both `dtype` and `categories`" + " or `ordered`.") + + categories = dtype.categories + ordered = dtype.ordered + + elif is_categorical(values): + # If no "dtype" was passed, use the one from "values", but honor + # the "ordered" and "categories" arguments + dtype = values.dtype._from_categorical_dtype(values.dtype, + categories, ordered) + else: + # If dtype=None and values is not categorical, create a new dtype + dtype = CategoricalDtype(categories, ordered) + + # At this point, dtype is always a CategoricalDtype + # if dtype.categories is None, we are inferring + + if fastpath: + self._codes = coerce_indexer_dtype(values, categories) + self._dtype = dtype + return + + # null_mask indicates missing values we want to exclude from inference. + # This means: only missing values in list-likes (not arrays/ndframes). 
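# Illustrative sketch of the dtype-resolution rules listed above (editor's
# example, not from the diff): an explicit CategoricalDtype takes precedence
# and cannot be combined with separate categories/ordered arguments.
import pandas as pd
from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(categories=['c', 'b', 'a'], ordered=True)
c1 = pd.Categorical(['a', 'b'], dtype=dtype)
c2 = pd.Categorical(['a', 'b'], categories=['c', 'b', 'a'], ordered=True)
print(c1.dtype == c2.dtype)      # True -- two spellings of the same dtype

try:
    pd.Categorical(['a'], categories=['a'], dtype=dtype)
except ValueError as exc:
    print(exc)                   # dtype and categories/ordered are exclusive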
+ null_mask = np.array(False) + + # sanitize input + if is_categorical_dtype(values): + if dtype.categories is None: + dtype = CategoricalDtype(values.categories, dtype.ordered) + + elif not isinstance(values, (ABCIndexClass, ABCSeries)): + # _sanitize_array coerces np.nan to a string under certain versions + # of numpy + values = maybe_infer_to_datetimelike(values, convert_dates=True) + if not isinstance(values, np.ndarray): + values = _convert_to_list_like(values) + from pandas.core.series import _sanitize_array + # By convention, empty lists result in object dtype: + if len(values) == 0: + sanitize_dtype = 'object' + else: + sanitize_dtype = None + null_mask = isna(values) + if null_mask.any(): + values = [values[idx] for idx in np.where(~null_mask)[0]] + values = _sanitize_array(values, None, dtype=sanitize_dtype) + + if dtype.categories is None: + try: + codes, categories = factorize(values, sort=True) + except TypeError: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError("'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument.") + except ValueError: + + # FIXME + raise NotImplementedError("> 1 ndim Categorical are not " + "supported at this time") + + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) + + elif is_categorical_dtype(values): + old_codes = (values.cat.codes if isinstance(values, ABCSeries) + else values.codes) + codes = _recode_for_categories(old_codes, values.dtype.categories, + dtype.categories) + + else: + codes = _get_codes_for_values(values, dtype.categories) + + if null_mask.any(): + # Reinsert -1 placeholders for previously removed missing values + full_codes = - np.ones(null_mask.shape, dtype=codes.dtype) + full_codes[~null_mask] = codes + codes = full_codes + + self._dtype = dtype + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + """The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (self.dtype.categories is not None and + len(self.dtype.categories) != len(new_dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") + self._dtype = new_dtype + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self.dtype.ordered + + @property + def dtype(self): + """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" + return self._dtype + + @property + def _constructor(self): + return Categorical + + def copy(self): + """ Copy constructor. 
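# Illustrative sketch of the null handling above (editor's example only):
# missing values are excluded from category inference and re-inserted
# afterwards as the -1 code.
import numpy as np
import pandas as pd

cat = pd.Categorical(['b', None, 'a', 'b', np.nan])
print(cat.categories.tolist())   # ['a', 'b']  -- NaN never becomes a category
print(cat.codes.tolist())        # [1, -1, 0, 1, -1]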
""" + return self._constructor(values=self._codes.copy(), + categories=self.categories, + ordered=self.ordered, + fastpath=True) + + def astype(self, dtype, copy=True): + """ + Coerce this type to another dtype + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and dtype is categorical, the original + object is returned. + + .. versionadded:: 0.19.0 + + """ + if is_categorical_dtype(dtype): + # GH 10696/18593 + dtype = self.dtype._update_dtype(dtype) + self = self.copy() if copy else self + if dtype == self.dtype: + return self + return self._set_dtype(dtype) + return np.array(self, dtype=dtype, copy=copy) + + @cache_readonly + def ndim(self): + """Number of dimensions of the Categorical """ + return self._codes.ndim + + @cache_readonly + def size(self): + """ return the len of myself """ + return len(self) + + @cache_readonly + def itemsize(self): + """ return the size of a single category """ + return self.categories.itemsize + + def tolist(self): + """ + Return a list of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + """ + if is_datetimelike(self.categories): + return [_maybe_box_datetimelike(x) for x in self] + return np.array(self).tolist() + + @property + def base(self): + """ compat, we are always our own object """ + return None + + @classmethod + def _from_inferred_categories(cls, inferred_categories, inferred_codes, + dtype): + """Construct a Categorical from inferred values + + For inferred categories (`dtype` is None) the categories are sorted. + For explicit `dtype`, the `inferred_categories` are cast to the + appropriate type. + + Parameters + ---------- + + inferred_categories : Index + inferred_codes : Index + dtype : CategoricalDtype or 'category' + + Returns + ------- + Categorical + """ + from pandas import Index, to_numeric, to_datetime, to_timedelta + + cats = Index(inferred_categories) + + known_categories = (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None) + + if known_categories: + # Convert to a specialzed type with `dtype` if specified + if dtype.categories.is_numeric(): + cats = to_numeric(inferred_categories, errors='coerce') + elif is_datetime64_dtype(dtype.categories): + cats = to_datetime(inferred_categories, errors='coerce') + elif is_timedelta64_dtype(dtype.categories): + cats = to_timedelta(inferred_categories, errors='coerce') + + if known_categories: + # recode from observation oder to dtype.categories order + categories = dtype.categories + codes = _recode_for_categories(inferred_codes, cats, categories) + elif not cats.is_monotonic_increasing: + # sort categories and recode for unknown categories + unsorted = cats.copy() + categories = cats.sort_values() + codes = _recode_for_categories(inferred_codes, unsorted, + categories) + dtype = CategoricalDtype(categories, ordered=False) + else: + dtype = CategoricalDtype(cats, ordered=False) + codes = inferred_codes + + return cls(codes, dtype=dtype, fastpath=True) + + @classmethod + def from_codes(cls, codes, categories, ordered=False): + """ + Make a Categorical type from codes and categories arrays. + + This constructor is useful if you already have codes and categories and + so do not need the (computation intensive) factorization step, which is + usually done on the constructor. 
+ + If your data does not follow this convention, please use the normal + constructor. + + Parameters + ---------- + codes : array-like, integers + An integer array, where each integer points to a category in + categories or -1 for NaN + categories : index-like + The categories for the categorical. Items need to be unique. + ordered : boolean, (default False) + Whether or not this categorical is treated as a ordered + categorical. If not given, the resulting categorical will be + unordered. + """ + try: + codes = np.asarray(codes, np.int64) + except (ValueError, TypeError): + raise ValueError( + "codes need to be convertible to an arrays of integers") + + categories = CategoricalDtype._validate_categories(categories) + + if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): + raise ValueError("codes need to be between -1 and " + "len(categories)-1") + + return cls(codes, categories=categories, ordered=ordered, + fastpath=True) + + _codes = None + + def _get_codes(self): + """ Get the codes. + + Returns + ------- + codes : integer array view + A non writable view of the `codes` array. + """ + v = self._codes.view() + v.flags.writeable = False + return v + + def _set_codes(self, codes): + """ + Not settable by the user directly + """ + raise ValueError("cannot set Categorical codes directly") + + codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) + + def _set_categories(self, categories, fastpath=False): + """ Sets new categories inplace + + Parameters + ---------- + fastpath : boolean (default: False) + Don't perform validation of the categories for uniqueness or nulls + + Examples + -------- + >>> c = Categorical(['a', 'b']) + >>> c + [a, b] + Categories (2, object): [a, b] + + >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c + [a, c] + Categories (2, object): [a, c] + """ + + if fastpath: + new_dtype = CategoricalDtype._from_fastpath(categories, + self.ordered) + else: + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (not fastpath and self.dtype.categories is not None and + len(new_dtype.categories) != len(self.dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items than the old categories!") + + self._dtype = new_dtype + + def _codes_for_groupby(self, sort): + """ + If sort=False, return a copy of self, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return self. + + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). + + Parameters + ---------- + sort : boolean + The value of the sort parameter groupby was called with. + + Returns + ------- + Categorical + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. 
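# Illustrative sketch of from_codes above (editor's example only): codes
# index into `categories`, -1 stands for NaN, and out-of-range codes are
# rejected.
import pandas as pd

cat = pd.Categorical.from_codes([0, 1, -1, 0], categories=['low', 'high'])
print(list(cat))                 # ['low', 'high', nan, 'low']

try:
    pd.Categorical.from_codes([0, 2], categories=['low', 'high'])
except ValueError as exc:
    print(exc)                   # codes must lie between -1 and len(categories)-1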
+ """ + + # Already sorted according to self.categories; all is fine + if sort: + return self + + # sort=False should order groups in as-encountered order (GH-8868) + cat = self.unique() + + # But for groupby to work, all categories should be present, + # including those missing from the data (GH-13179), which .unique() + # above dropped + cat.add_categories( + self.categories[~self.categories.isin(cat.categories)], + inplace=True) + + return self.reorder_categories(cat.categories) + + def _set_dtype(self, dtype): + """Internal method for directly updating the CategoricalDtype + + Parameters + ---------- + dtype : CategoricalDtype + + Notes + ----- + We don't do any validation here. It's assumed that the dtype is + a (valid) instance of `CategoricalDtype`. + """ + codes = _recode_for_categories(self.codes, self.categories, + dtype.categories) + return type(self)(codes, dtype=dtype, fastpath=True) + + def set_ordered(self, value, inplace=False): + """ + Sets the ordered attribute to the boolean value + + Parameters + ---------- + value : boolean to set whether this categorical is ordered (True) or + not (False) + inplace : boolean (default: False) + Whether or not to set the ordered attribute inplace or return a copy + of this categorical with ordered set to the value + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + new_dtype = CategoricalDtype(self.categories, ordered=value) + cat = self if inplace else self.copy() + cat._dtype = new_dtype + if not inplace: + return cat + + def as_ordered(self, inplace=False): + """ + Sets the Categorical to be ordered + + Parameters + ---------- + inplace : boolean (default: False) + Whether or not to set the ordered attribute inplace or return a copy + of this categorical with ordered set to True + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + return self.set_ordered(True, inplace=inplace) + + def as_unordered(self, inplace=False): + """ + Sets the Categorical to be unordered + + Parameters + ---------- + inplace : boolean (default: False) + Whether or not to set the ordered attribute inplace or return a copy + of this categorical with ordered set to False + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + return self.set_ordered(False, inplace=inplace) + + def set_categories(self, new_categories, ordered=None, rename=False, + inplace=False): + """ Sets the categories to the specified new_categories. + + `new_categories` can include new categories (which will result in + unused categories) or remove old categories (which results in values + set to NaN). If `rename==True`, the categories will simple be renamed + (less or more items than in old categories will result in values set to + NaN or in unused categories respectively). + + This method can be used to perform more than one action of adding, + removing, and reordering simultaneously and is therefore faster than + performing the individual steps via the more specialised methods. + + On the other hand this methods does not do checks (e.g., whether the + old categories are included in the new categories on a reorder), which + can result in surprising changes, for example when using special string + dtypes on python3, which does not considers a S1 string equal to a + single char python string. + + Raises + ------ + ValueError + If new_categories does not validate as categories + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : boolean, (default: False) + Whether or not the categorical is treated as a ordered categorical. 
+ If not given, do not change the ordered information. + rename : boolean (default: False) + Whether or not the new_categories should be considered as a rename + of the old categories or as reordered categories. + inplace : boolean (default: False) + Whether or not to reorder the categories inplace or return a copy of + this categorical with reordered categories. + + Returns + ------- + cat : Categorical with reordered categories or None if inplace. + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + + cat = self if inplace else self.copy() + if rename: + if (cat.dtype.categories is not None and + len(new_dtype.categories) < len(cat.dtype.categories)): + # remove all _codes which are larger and set to -1/NaN + self._codes[self._codes >= len(new_dtype.categories)] = -1 + else: + codes = _recode_for_categories(self.codes, self.categories, + new_dtype.categories) + cat._codes = codes + cat._dtype = new_dtype + + if not inplace: + return cat + + def rename_categories(self, new_categories, inplace=False): + """ Renames categories. + + Raises + ------ + ValueError + If new categories are list-like and do not have the same number of + items than the current categories or do not validate as categories + + Parameters + ---------- + new_categories : list-like, dict-like or callable + + * list-like: all items must be unique and the number of items in + the new categories must match the existing number of categories. + + * dict-like: specifies a mapping from + old categories to new. Categories not contained in the mapping + are passed through and extra categories in the mapping are + ignored. + + .. versionadded:: 0.21.0 + + * callable : a callable that is called on all items in the old + categories and whose return values comprise the new categories. + + .. versionadded:: 0.23.0 + + .. warning:: + + Currently, Series are considered list like. In a future version + of pandas they'll be considered dict-like. + + inplace : boolean (default: False) + Whether or not to rename the categories inplace or return a copy of + this categorical with renamed categories. + + Returns + ------- + cat : Categorical or None + With ``inplace=False``, the new categorical is returned. + With ``inplace=True``, there is no return value. + + See also + -------- + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + + Examples + -------- + >>> c = Categorical(['a', 'a', 'b']) + >>> c.rename_categories([0, 1]) + [0, 0, 1] + Categories (2, int64): [0, 1] + + For dict-like ``new_categories``, extra keys are ignored and + categories not in the dictionary are passed through + + >>> c.rename_categories({'a': 'A', 'c': 'C'}) + [A, A, b] + Categories (2, object): [A, b] + + You may also provide a callable to create the new categories + + >>> c.rename_categories(lambda x: x.upper()) + [A, A, B] + Categories (2, object): [A, B] + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + cat = self if inplace else self.copy() + + if isinstance(new_categories, ABCSeries): + msg = ("Treating Series 'new_categories' as a list-like and using " + "the values. 
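# Illustrative sketch of set_categories above (editor's example only):
# categories missing from the new list turn the matching values into NaN,
# extra ones simply stay unused.
import pandas as pd

cat = pd.Categorical(['a', 'b', 'c'])
out = cat.set_categories(['a', 'b', 'd'])
print(out.codes.tolist())              # [0, 1, -1]  -- 'c' became NaN
print(out.categories.tolist())         # ['a', 'b', 'd']  -- 'd' is unused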
In a future version, 'rename_categories' will " + "treat Series like a dictionary.\n" + "For dict-like, use 'new_categories.to_dict()'\n" + "For list-like, use 'new_categories.values'.") + warn(msg, FutureWarning, stacklevel=2) + new_categories = list(new_categories) + + if is_dict_like(new_categories): + cat.categories = [new_categories.get(item, item) + for item in cat.categories] + elif callable(new_categories): + cat.categories = [new_categories(item) for item in cat.categories] + else: + cat.categories = new_categories + if not inplace: + return cat + + def reorder_categories(self, new_categories, ordered=None, inplace=False): + """ Reorders categories as specified in new_categories. + + `new_categories` need to include all old categories and no new category + items. + + Raises + ------ + ValueError + If the new categories do not contain all old category items or any + new ones + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : boolean, optional + Whether or not the categorical is treated as a ordered categorical. + If not given, do not change the ordered information. + inplace : boolean (default: False) + Whether or not to reorder the categories inplace or return a copy of + this categorical with reordered categories. + + Returns + ------- + cat : Categorical with reordered categories or None if inplace. + + See also + -------- + rename_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + if set(self.dtype.categories) != set(new_categories): + raise ValueError("items in new_categories are not the same as in " + "old categories") + return self.set_categories(new_categories, ordered=ordered, + inplace=inplace) + + def add_categories(self, new_categories, inplace=False): + """ Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + + Raises + ------ + ValueError + If the new categories include old categories or do not validate as + categories + + Parameters + ---------- + new_categories : category or list-like of category + The new categories to be included. + inplace : boolean (default: False) + Whether or not to add the categories inplace or return a copy of + this categorical with added categories. + + Returns + ------- + cat : Categorical with new categories added or None if inplace. + + See also + -------- + rename_categories + reorder_categories + remove_categories + remove_unused_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + if not is_list_like(new_categories): + new_categories = [new_categories] + already_included = set(new_categories) & set(self.dtype.categories) + if len(already_included) != 0: + msg = ("new categories must not include old categories: " + "{already_included!s}") + raise ValueError(msg.format(already_included=already_included)) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + + cat = self if inplace else self.copy() + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) + if not inplace: + return cat + + def remove_categories(self, removals, inplace=False): + """ Removes the specified categories. + + `removals` must be included in the old categories. 
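# Illustrative sketch of reorder_categories vs add_categories above (editor's
# example, not from the diff): reorder requires exactly the same set of
# categories, add appends new, initially unused ones.
import pandas as pd

cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
print(cat.reorder_categories(['b', 'a'], ordered=True).ordered)   # True
print(cat.add_categories(['c']).categories.tolist())              # ['a', 'b', 'c']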
Values which were in + the removed categories will be set to NaN + + Raises + ------ + ValueError + If the removals are not contained in the categories + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + inplace : boolean (default: False) + Whether or not to remove the categories inplace or return a copy of + this categorical with removed categories. + + Returns + ------- + cat : Categorical with removed categories or None if inplace. + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_unused_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + if not is_list_like(removals): + removals = [removals] + + removal_set = set(list(removals)) + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories + if c not in removal_set] + + # GH 10156 + if any(isna(removals)): + not_included = [x for x in not_included if notna(x)] + new_categories = [x for x in new_categories if notna(x)] + + if len(not_included) != 0: + msg = "removals must all be in old categories: {not_included!s}" + raise ValueError(msg.format(not_included=not_included)) + + return self.set_categories(new_categories, ordered=self.ordered, + rename=False, inplace=inplace) + + def remove_unused_categories(self, inplace=False): + """ Removes categories which are not used. + + Parameters + ---------- + inplace : boolean (default: False) + Whether or not to drop unused categories inplace or return a copy of + this categorical with unused categories dropped. + + Returns + ------- + cat : Categorical with unused categories dropped or None if inplace. + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + cat = self if inplace else self.copy() + idx, inv = np.unique(cat._codes, return_inverse=True) + + if idx.size != 0 and idx[0] == -1: # na sentinel + idx, inv = idx[1:], inv - 1 + + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype._from_fastpath(new_categories, + ordered=self.ordered) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) + + if not inplace: + return cat + + def map(self, mapper): + """Apply mapper function to its categories (not codes). + + Parameters + ---------- + mapper : callable + Function to be applied. When all categories are mapped + to different categories, the result will be Categorical which has + the same order property as the original. Otherwise, the result will + be np.ndarray. + + Returns + ------- + applied : Categorical or Index. + + """ + new_categories = self.categories.map(mapper) + try: + return self.from_codes(self._codes.copy(), + categories=new_categories, + ordered=self.ordered) + except ValueError: + return np.take(new_categories, self._codes) + + __eq__ = _cat_compare_op('__eq__') + __ne__ = _cat_compare_op('__ne__') + __lt__ = _cat_compare_op('__lt__') + __gt__ = _cat_compare_op('__gt__') + __le__ = _cat_compare_op('__le__') + __ge__ = _cat_compare_op('__ge__') + + # for Series/ndarray like compat + @property + def shape(self): + """ Shape of the Categorical. + + For internal compatibility with numpy arrays. + + Returns + ------- + shape : tuple + """ + + return tuple([len(self._codes)]) + + def shift(self, periods): + """ + Shift Categorical by desired number of periods. 
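# Illustrative sketch (editor's example only): remove_categories sets the
# removed values to NaN, while remove_unused_categories only drops labels
# that no value points at.
import pandas as pd

cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
print(cat.remove_categories(['b']).codes.tolist())            # [0, -1, 0]
print(cat.remove_unused_categories().categories.tolist())     # ['a', 'b']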
+ + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + + Returns + ------- + shifted : Categorical + """ + # since categoricals always have ndim == 1, an axis parameter + # doesn't make any sense here. + codes = self.codes + if codes.ndim > 1: + raise NotImplementedError("Categorical with ndim > 1.") + if np.prod(codes.shape) and (periods != 0): + codes = np.roll(codes, _ensure_platform_int(periods), axis=0) + if periods > 0: + codes[:periods] = -1 + else: + codes[periods:] = -1 + + return self.from_codes(codes, categories=self.categories, + ordered=self.ordered) + + def __array__(self, dtype=None): + """ + The numpy array interface. + + Returns + ------- + values : numpy array + A numpy array of either the specified dtype or, + if dtype==None (default), the same dtype as + categorical.categories.dtype + """ + ret = take_1d(self.categories.values, self._codes) + if dtype and not is_dtype_equal(dtype, self.categories.dtype): + return np.asarray(ret, dtype) + return ret + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if not isinstance(state, dict): + raise Exception('invalid pickle state') + + # Provide compatibility with pre-0.15.0 Categoricals. + if '_categories' not in state and '_levels' in state: + state['_categories'] = self.dtype._validate_categories(state.pop( + '_levels')) + if '_codes' not in state and 'labels' in state: + state['_codes'] = coerce_indexer_dtype( + state.pop('labels'), state['_categories']) + + # 0.16.0 ordered change + if '_ordered' not in state: + + # >=15.0 < 0.16.0 + if 'ordered' in state: + state['_ordered'] = state.pop('ordered') + else: + state['_ordered'] = False + + # 0.21.0 CategoricalDtype change + if '_dtype' not in state: + state['_dtype'] = CategoricalDtype(state['_categories'], + state['_ordered']) + + for k, v in compat.iteritems(state): + setattr(self, k, v) + + @property + def T(self): + return self + + @property + def nbytes(self): + return self._codes.nbytes + self.dtype.categories.values.nbytes + + def memory_usage(self, deep=False): + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + return self._codes.nbytes + self.dtype.categories.memory_usage( + deep=deep) + + @Substitution(klass='Categorical') + @Appender(_shared_docs['searchsorted']) + @deprecate_kwarg(old_arg_name='v', new_arg_name='value') + def searchsorted(self, value, side='left', sorter=None): + if not self.ordered: + raise ValueError("Categorical not ordered\nyou can use " + ".as_ordered() to change the Categorical to an " + "ordered one") + + from pandas.core.series import Series + + values_as_codes = _get_codes_for_values(Series(value).values, + self.categories) + + if -1 in values_as_codes: + raise ValueError("Value(s) to be inserted must be in categories.") + + return self.codes.searchsorted(values_as_codes, side=side, + sorter=sorter) + + def isna(self): + """ + Detect missing values + + Both missing values (-1 in .codes) and NA as a category are detected. 
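# Illustrative sketch of shift above (editor's example, not from the diff):
# the codes are rolled and the vacated positions get the -1 (NaN) code.
import pandas as pd

cat = pd.Categorical(['a', 'b', 'c'])
print(list(cat.shift(1)))    # [nan, 'a', 'b']
print(list(cat.shift(-1)))   # ['b', 'c', nan]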
+ + Returns + ------- + a boolean array of whether my values are null + + See also + -------- + isna : top-level isna + isnull : alias of isna + Categorical.notna : boolean inverse of Categorical.isna + + """ + + ret = self._codes == -1 + + # String/object and float categories can hold np.nan + if self.categories.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.categories: + nan_pos = np.where(isna(self.categories))[0] + # we only have one NA in categories + ret = np.logical_or(ret, self._codes == nan_pos) + return ret + isnull = isna + + def notna(self): + """ + Inverse of isna + + Both missing values (-1 in .codes) and NA as a category are detected as + null. + + Returns + ------- + a boolean array of whether my values are not null + + See also + -------- + notna : top-level notna + notnull : alias of notna + Categorical.isna : boolean inverse of Categorical.notna + + """ + return ~self.isna() + notnull = notna + + def put(self, *args, **kwargs): + """ + Replace specific elements in the Categorical with given values. + """ + raise NotImplementedError(("'put' is not yet implemented " + "for Categorical")) + + def dropna(self): + """ + Return the Categorical without null values. + + Both missing values (-1 in .codes) and NA as a category are detected. + NA is removed from the categories if present. + + Returns + ------- + valid : Categorical + """ + result = self[self.notna()] + if isna(result.categories).any(): + result = result.remove_categories([np.nan]) + return result + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each category. + + Every category will have an entry, even those with a count of 0. + + Parameters + ---------- + dropna : boolean, default True + Don't include counts of NaN, even if NaN is a category. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + + """ + from numpy import bincount + from pandas import isna, Series, CategoricalIndex + + obj = (self.remove_categories([np.nan]) if dropna and + isna(self.categories).any() else self) + code, cat = obj._codes, obj.categories + ncat, mask = len(cat), 0 <= code + ix, clean = np.arange(ncat), mask.all() + + if dropna or clean: + obs = code if clean else code[mask] + count = bincount(obs, minlength=ncat or None) + else: + count = bincount(np.where(mask, code, ncat)) + ix = np.append(ix, -1) + + ix = self._constructor(ix, dtype=self.dtype, + fastpath=True) + + return Series(count, index=CategoricalIndex(ix), dtype='int64') + + def get_values(self): + """ Return the values. + + For internal compatibility with pandas formatting. + + Returns + ------- + values : numpy array + A numpy array of the same dtype as categorical.categories.dtype or + Index if datetime / periods + """ + # if we are a datetime and period index, return Index to keep metadata + if is_datetimelike(self.categories): + return self.categories.take(self._codes, fill_value=np.nan) + return np.array(self) + + def check_for_ordered(self, op): + """ assert that we are ordered """ + if not self.ordered: + raise TypeError("Categorical is not ordered for operation {op}\n" + "you can use .as_ordered() to change the " + "Categorical to an ordered one\n".format(op=op)) + + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): + """ + Returns the indices that would sort the Categorical instance if + 'sort_values' was called. This function is implemented to provide + compatibility with numpy ndarray objects. 
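# Illustrative sketch of value_counts above (editor's example only): every
# category gets a row, even with a count of zero, and NaN is only counted
# when dropna=False.
import numpy as np
import pandas as pd

cat = pd.Categorical(['a', 'a', np.nan], categories=['a', 'b'])
print(cat.value_counts())                # a -> 2, b -> 0
print(cat.value_counts(dropna=False))    # a -> 2, b -> 0, NaN -> 1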
+ + While an ordering is applied to the category values, arg-sorting + in this context refers more to organizing and grouping together + based on matching category values. Thus, this function can be + called on an unordered Categorical instance unlike the functions + 'Categorical.min' and 'Categorical.max'. + + Returns + ------- + argsorted : numpy array + + See also + -------- + numpy.ndarray.argsort + """ + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + result = np.argsort(self._codes.copy(), kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result + + def sort_values(self, inplace=False, ascending=True, na_position='last'): + """ Sorts the Categorical by category value returning a new + Categorical by default. + + While an ordering is applied to the category values, sorting in this + context refers more to organizing and grouping together based on + matching category values. Thus, this function can be called on an + unordered Categorical instance unlike the functions 'Categorical.min' + and 'Categorical.max'. + + Parameters + ---------- + inplace : boolean, default False + Do operation in place. + ascending : boolean, default True + Order ascending. Passing False orders descending. The + ordering parameter provides the method by which the + category values are organized. + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + + Returns + ------- + y : Categorical or None + + See Also + -------- + Categorical.sort + Series.sort_values + + Examples + -------- + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + >>> c + [1, 2, 2, 1, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values() + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values(ascending=False) + [5, 2, 2, 1, 1] + Categories (3, int64): [1, 2, 5] + + Inplace sorting can be done as well: + + >>> c.sort_values(inplace=True) + >>> c + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + + 'sort_values' behaviour with NaNs. Note that 'na_position' + is independent of the 'ascending' parameter: + + >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) + >>> c + [NaN, 2.0, 2.0, NaN, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values() + [2.0, 2.0, 5.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False) + [5.0, 2.0, 2.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(na_position='first') + [NaN, NaN, 2.0, 2.0, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False, na_position='first') + [NaN, NaN, 5.0, 2.0, 2.0] + Categories (2, int64): [2, 5] + """ + inplace = validate_bool_kwarg(inplace, 'inplace') + if na_position not in ['last', 'first']: + msg = 'invalid na_position: {na_position!r}' + raise ValueError(msg.format(na_position=na_position)) + + codes = np.sort(self._codes) + if not ascending: + codes = codes[::-1] + + # NaN handling + na_mask = (codes == -1) + if na_mask.any(): + n_nans = len(codes[na_mask]) + if na_position == "first": + # in this case sort to the front + new_codes = codes.copy() + new_codes[0:n_nans] = -1 + new_codes[n_nans:] = codes[~na_mask] + codes = new_codes + elif na_position == "last": + # ... 
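# Illustrative sketch of argsort above (editor's example, not from the diff):
# it argsorts the integer codes, so it also works on unordered categoricals.
import numpy as np
import pandas as pd

cat = pd.Categorical(['b', 'a', 'c'], categories=['a', 'b', 'c'])
order = cat.argsort()
print(order)                      # [1 0 2]
print(np.asarray(cat)[order])     # ['a' 'b' 'c']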
and to the end + new_codes = codes.copy() + pos = len(codes) - n_nans + new_codes[0:pos] = codes[~na_mask] + new_codes[pos:] = -1 + codes = new_codes + if inplace: + self._codes = codes + return + else: + return self._constructor(values=codes, categories=self.categories, + ordered=self.ordered, fastpath=True) + + def _values_for_rank(self): + """ + For correctly ranking ordered categorical data. See GH#15420 + + Ordered categorical data should be ranked on the basis of + codes with -1 translated to NaN. + + Returns + ------- + numpy array + + """ + from pandas import Series + if self.ordered: + values = self.codes + mask = values == -1 + if mask.any(): + values = values.astype('float64') + values[mask] = np.nan + elif self.categories.is_numeric(): + values = np.array(self) + else: + # reorder the categories (so rank can use the float codes) + # instead of passing an object array to rank + values = np.array( + self.rename_categories(Series(self.categories).rank().values) + ) + return values + + def ravel(self, order='C'): + """ Return a flattened (numpy) array. + + For internal compatibility with numpy arrays. + + Returns + ------- + raveled : numpy array + """ + return np.array(self) + + def view(self): + """Return a view of myself. + + For internal compatibility with numpy arrays. + + Returns + ------- + view : Categorical + Returns `self`! + """ + return self + + def to_dense(self): + """Return my 'dense' representation + + For internal compatibility with numpy arrays. + + Returns + ------- + dense : array + """ + return np.asarray(self) + + @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value') + def fillna(self, value=None, method=None, limit=None): + """ Fill NA/NaN values using the specified method. + + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should either be in the categories or should be + NaN. + limit : int, default None + (Not implemented yet for Categorical!) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. 
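# Illustrative sketch of fillna above (editor's example only): only values
# that are already categories (or NaN) are accepted as fill values; the
# exact exception type has varied across pandas versions, so both are caught.
import numpy as np
import pandas as pd

cat = pd.Categorical(['a', np.nan, 'b'], categories=['a', 'b'])
print(list(cat.fillna('a')))      # ['a', 'a', 'b']
try:
    cat.fillna('z')
except (ValueError, TypeError) as exc:
    print(exc)                    # fill value must be in categories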
+ + Returns + ------- + filled : Categorical with NA/NaN filled + """ + + if value is None: + value = np.nan + if limit is not None: + raise NotImplementedError("specifying a limit for fillna has not " + "been implemented yet") + + values = self._codes + + # Make sure that we also get NA in categories + if self.categories.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.categories: + values = values.copy() + nan_pos = np.where(isna(self.categories))[0] + # we only have one NA in categories + values[values == nan_pos] = -1 + + # pad / bfill + if method is not None: + + values = self.to_dense().reshape(-1, len(self)) + values = interpolate_2d(values, method, 0, None, + value).astype(self.categories.dtype)[0] + values = _get_codes_for_values(values, self.categories) + + else: + + # If value is a dict or a Series (a dict value has already + # been converted to a Series) + if isinstance(value, ABCSeries): + if not value[~value.isin(self.categories)].isna().all(): + raise ValueError("fill value must be in categories") + + values_codes = _get_codes_for_values(value, self.categories) + indexer = np.where(values_codes != -1) + values[indexer] = values_codes[values_codes != -1] + + # If value is not a dict or Series it should be a scalar + elif is_scalar(value): + if not isna(value) and value not in self.categories: + raise ValueError("fill value must be in categories") + + mask = values == -1 + if mask.any(): + values = values.copy() + if isna(value): + values[mask] = -1 + else: + values[mask] = self.categories.get_loc(value) + + else: + raise TypeError('"value" parameter must be a scalar, dict ' + 'or Series, but you passed a ' + '"{0}"'.format(type(value).__name__)) + + return self._constructor(values, categories=self.categories, + ordered=self.ordered, fastpath=True) + + def take_nd(self, indexer, allow_fill=True, fill_value=None): + """ Take the codes by the indexer, fill with the fill_value. + + For internal compatibility with numpy arrays. + """ + + # filling must always be None/nan here + # but is passed thru internally + assert isna(fill_value) + + codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) + result = self._constructor(codes, categories=self.categories, + ordered=self.ordered, fastpath=True) + return result + + take = take_nd + + def _slice(self, slicer): + """ Return a slice of myself. + + For internal compatibility with numpy arrays. + """ + + # only allow 1 dimensional slicing, but can + # in a 2-d case be passd (slice(None),....) 
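# Illustrative sketch of take above (editor's example, not from the diff):
# the indexer is applied to the codes, so the result keeps the same
# categories and dtype.
import pandas as pd

cat = pd.Categorical(['a', 'b', 'c'])
taken = cat.take([2, 0, 0])
print(list(taken))                    # ['c', 'a', 'a']
print(taken.categories.tolist())      # ['a', 'b', 'c']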
+ if isinstance(slicer, tuple) and len(slicer) == 2: + if not is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + slicer = slicer[1] + + _codes = self._codes[slicer] + return self._constructor(values=_codes, categories=self.categories, + ordered=self.ordered, fastpath=True) + + def __len__(self): + """The length of this Categorical.""" + return len(self._codes) + + def __iter__(self): + """Returns an Iterator over the values of this Categorical.""" + return iter(self.get_values()) + + def _tidy_repr(self, max_vals=10, footer=True): + """ a short repr displaying only max_vals and an optional (but default + footer) + """ + num = max_vals // 2 + head = self[:num]._get_repr(length=False, footer=False) + tail = self[-(max_vals - num):]._get_repr(length=False, footer=False) + + result = u('{head}, ..., {tail}').format(head=head[:-1], tail=tail[1:]) + if footer: + result = u('{result}\n{footer}').format(result=result, + footer=self._repr_footer()) + + return compat.text_type(result) + + def _repr_categories(self): + """ return the base repr for the categories """ + max_categories = (10 if get_option("display.max_categories") == 0 else + get_option("display.max_categories")) + from pandas.io.formats import format as fmt + if len(self.categories) > max_categories: + num = max_categories // 2 + head = fmt.format_array(self.categories[:num], None) + tail = fmt.format_array(self.categories[-num:], None) + category_strs = head + ["..."] + tail + else: + category_strs = fmt.format_array(self.categories, None) + + # Strip all leading spaces, which format_array adds for columns... + category_strs = [x.strip() for x in category_strs] + return category_strs + + def _repr_categories_info(self): + """ Returns a string representation of the footer.""" + + category_strs = self._repr_categories() + dtype = getattr(self.categories, 'dtype_str', + str(self.categories.dtype)) + + levheader = "Categories ({length}, {dtype}): ".format( + length=len(self.categories), dtype=dtype) + width, height = get_terminal_size() + max_width = get_option("display.width") or width + if com.in_ipython_frontend(): + # 0 = no breaks + max_width = 0 + levstring = "" + start = True + cur_col_len = len(levheader) # header + sep_len, sep = (3, " < ") if self.ordered else (2, ", ") + linesep = sep.rstrip() + "\n" # remove whitespace + for val in category_strs: + if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: + levstring += linesep + (" " * (len(levheader) + 1)) + cur_col_len = len(levheader) + 1 # header + a whitespace + elif not start: + levstring += sep + cur_col_len += len(val) + levstring += val + start = False + # replace to simple save space by + return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]" + + def _repr_footer(self): + + return u('Length: {length}\n{info}').format( + length=len(self), info=self._repr_categories_info()) + + def _get_repr(self, length=True, na_rep='NaN', footer=True): + from pandas.io.formats import format as fmt + formatter = fmt.CategoricalFormatter(self, length=length, + na_rep=na_rep, footer=footer) + result = formatter.to_string() + return compat.text_type(result) + + def __unicode__(self): + """ Unicode representation. 
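# Illustrative sketch of the repr helpers above (editor's example only): the
# footer lists the categories and uses " < " once the categorical is ordered;
# exact quoting of values varies by pandas version.
import pandas as pd

print(pd.Categorical(['a', 'b', 'a'], ordered=True))
# [a, b, a]
# Categories (2, object): [a < b]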
""" + _maxlen = 10 + if len(self._codes) > _maxlen: + result = self._tidy_repr(_maxlen) + elif len(self._codes) > 0: + result = self._get_repr(length=len(self) > _maxlen) + else: + msg = self._get_repr(length=False, footer=True).replace("\n", ", ") + result = ('[], {repr_msg}'.format(repr_msg=msg)) + + return result + + def _maybe_coerce_indexer(self, indexer): + """ return an indexer coerced to the codes dtype """ + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': + indexer = indexer.astype(self._codes.dtype) + return indexer + + def __getitem__(self, key): + """ Return an item. """ + if isinstance(key, (int, np.integer)): + i = self._codes[key] + if i == -1: + return np.nan + else: + return self.categories[i] + else: + return self._constructor(values=self._codes[key], + categories=self.categories, + ordered=self.ordered, fastpath=True) + + def __setitem__(self, key, value): + """ Item assignment. + + + Raises + ------ + ValueError + If (one or more) Value is not in categories or if a assigned + `Categorical` does not have the same categories + """ + + # require identical categories set + if isinstance(value, Categorical): + if not value.categories.equals(self.categories): + raise ValueError("Cannot set a Categorical with another, " + "without identical categories") + + rvalue = value if is_list_like(value) else [value] + + from pandas import Index + to_add = Index(rvalue).difference(self.categories) + + # no assignments of values not in categories, but it's always ok to set + # something to np.nan + if len(to_add) and not isna(to_add).all(): + raise ValueError("Cannot setitem on a Categorical with a new " + "category, set the categories first") + + # set by position + if isinstance(key, (int, np.integer)): + pass + + # tuple of indexers (dataframe) + elif isinstance(key, tuple): + # only allow 1 dimensional slicing, but can + # in a 2-d case be passd (slice(None),....) + if len(key) == 2: + if not is_null_slice(key[0]): + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + key = key[1] + elif len(key) == 1: + key = key[0] + else: + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + + # slicing in Series or Categorical + elif isinstance(key, slice): + pass + + # Array of True/False in Series or Categorical + else: + # There is a bug in numpy, which does not accept a Series as a + # indexer + # https://github.com/pandas-dev/pandas/issues/6168 + # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9 + # FIXME: remove when numpy 1.9 is the lowest numpy version pandas + # accepts... + key = np.asarray(key) + + lindexer = self.categories.get_indexer(rvalue) + + # FIXME: the following can be removed after GH7820 is fixed: + # https://github.com/pandas-dev/pandas/issues/7820 + # float categories do currently return -1 for np.nan, even if np.nan is + # included in the index -> "repair" this here + if isna(rvalue).any() and isna(self.categories).any(): + nan_pos = np.where(isna(self.categories))[0] + lindexer[lindexer == -1] = nan_pos + + lindexer = self._maybe_coerce_indexer(lindexer) + self._codes[key] = lindexer + + def _reverse_indexer(self): + """ + Compute the inverse of a categorical, returning + a dict of categories -> indexers. 
+ + *This is an internal function* + + Returns + ------- + dict of categories -> indexers + + Example + ------- + In [1]: c = pd.Categorical(list('aabca')) + + In [2]: c + Out[2]: + [a, a, b, c, a] + Categories (3, object): [a, b, c] + + In [3]: c.categories + Out[3]: Index([u'a', u'b', u'c'], dtype='object') + + In [4]: c.codes + Out[4]: array([0, 0, 1, 2, 0], dtype=int8) + + In [5]: c._reverse_indexer() + Out[5]: {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} + + """ + categories = self.categories + r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'), + categories.size) + counts = counts.cumsum() + result = [r[counts[indexer]:counts[indexer + 1]] + for indexer in range(len(counts) - 1)] + result = dict(zip(categories, result)) + return result + + # reduction ops # + def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwds): + """ perform the reduction type operation """ + func = getattr(self, name, None) + if func is None: + msg = 'Categorical cannot perform the operation {op}' + raise TypeError(msg.format(op=name)) + return func(numeric_only=numeric_only, **kwds) + + def min(self, numeric_only=None, **kwargs): + """ The minimum value of the object. + + Only ordered `Categoricals` have a minimum! + + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + min : the minimum of this `Categorical` + """ + self.check_for_ordered('min') + if numeric_only: + good = self._codes != -1 + pointer = self._codes[good].min(**kwargs) + else: + pointer = self._codes.min(**kwargs) + if pointer == -1: + return np.nan + else: + return self.categories[pointer] + + def max(self, numeric_only=None, **kwargs): + """ The maximum value of the object. + + Only ordered `Categoricals` have a maximum! + + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + max : the maximum of this `Categorical` + """ + self.check_for_ordered('max') + if numeric_only: + good = self._codes != -1 + pointer = self._codes[good].max(**kwargs) + else: + pointer = self._codes.max(**kwargs) + if pointer == -1: + return np.nan + else: + return self.categories[pointer] + + def mode(self): + """ + Returns the mode(s) of the Categorical. + + Always returns `Categorical` even if only one value. + + Returns + ------- + modes : `Categorical` (sorted) + """ + + import pandas._libs.hashtable as htable + good = self._codes != -1 + values = sorted(htable.mode_int64(_ensure_int64(self._codes[good]))) + result = self._constructor(values=values, categories=self.categories, + ordered=self.ordered, fastpath=True) + return result + + def unique(self): + """ + Return the ``Categorical`` which ``categories`` and ``codes`` are + unique. Unused categories are NOT returned. + + - unordered category: values and categories are sorted by appearance + order. + - ordered category: values are sorted by appearance order, categories + keeps existing order. + + Returns + ------- + unique values : ``Categorical`` + + Examples + -------- + An unordered Categorical will return categories in the + order of appearance. + + >>> pd.Categorical(list('baabc')) + [b, a, c] + Categories (3, object): [b, a, c] + + >>> pd.Categorical(list('baabc'), categories=list('abc')) + [b, a, c] + Categories (3, object): [b, a, c] + + An ordered Categorical preserves the category ordering. + + >>> pd.Categorical(list('baabc'), + ... categories=list('abc'), + ... 
ordered=True)
+        [b, a, c]
+        Categories (3, object): [a < b < c]
+
+        See Also
+        --------
+        unique
+        CategoricalIndex.unique
+        Series.unique
+
+        """
+
+        # unlike np.unique, unique1d does not sort
+        unique_codes = unique1d(self.codes)
+        cat = self.copy()
+
+        # keep nan in codes
+        cat._codes = unique_codes
+
+        # exclude nan from indexer for categories
+        take_codes = unique_codes[unique_codes != -1]
+        if self.ordered:
+            take_codes = sorted(take_codes)
+        return cat.set_categories(cat.categories.take(take_codes))
+
+    def equals(self, other):
+        """
+        Returns True if categorical arrays are equal.
+
+        Parameters
+        ----------
+        other : `Categorical`
+
+        Returns
+        -------
+        are_equal : boolean
+        """
+        if self.is_dtype_equal(other):
+            if self.categories.equals(other.categories):
+                # fastpath to avoid re-coding
+                other_codes = other._codes
+            else:
+                other_codes = _recode_for_categories(other.codes,
+                                                     other.categories,
+                                                     self.categories)
+            return np.array_equal(self._codes, other_codes)
+        return False
+
+    def is_dtype_equal(self, other):
+        """
+        Returns True if categoricals have the same dtype, i.e. the same
+        categories and the same ordered attribute.
+
+        Parameters
+        ----------
+        other : Categorical
+
+        Returns
+        -------
+        are_equal : boolean
+        """
+
+        try:
+            return hash(self.dtype) == hash(other.dtype)
+        except (AttributeError, TypeError):
+            return False
+
+    def describe(self):
+        """ Describes this Categorical
+
+        Returns
+        -------
+        description : `DataFrame`
+            A dataframe with frequency and counts by category.
+        """
+        counts = self.value_counts(dropna=False)
+        freqs = counts / float(counts.sum())
+
+        from pandas.core.reshape.concat import concat
+        result = concat([counts, freqs], axis=1)
+        result.columns = ['counts', 'freqs']
+        result.index.name = 'categories'
+
+        return result
+
+    def repeat(self, repeats, *args, **kwargs):
+        """
+        Repeat elements of a Categorical.
+
+        See also
+        --------
+        numpy.ndarray.repeat
+
+        """
+        nv.validate_repeat(args, kwargs)
+        codes = self._codes.repeat(repeats)
+        return self._constructor(values=codes, categories=self.categories,
+                                 ordered=self.ordered, fastpath=True)
+
+# The Series.cat accessor
+
+
+class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
+    """
+    Accessor object for categorical properties of the Series values.
+
+    Be aware that assigning to `categories` is an inplace operation, while all
+    methods return new categorical data by default (but can be called with
+    `inplace=True`).
+ + Parameters + ---------- + data : Series or CategoricalIndex + + Examples + -------- + >>> s.cat.categories + >>> s.cat.categories = list('abc') + >>> s.cat.rename_categories(list('cab')) + >>> s.cat.reorder_categories(list('cab')) + >>> s.cat.add_categories(['d','e']) + >>> s.cat.remove_categories(['d']) + >>> s.cat.remove_unused_categories() + >>> s.cat.set_categories(list('abcde')) + >>> s.cat.as_ordered() + >>> s.cat.as_unordered() + + """ + + def __init__(self, data): + self._validate(data) + self.categorical = data.values + self.index = data.index + self.name = data.name + self._freeze() + + @staticmethod + def _validate(data): + if not is_categorical_dtype(data.dtype): + raise AttributeError("Can only use .cat accessor with a " + "'category' dtype") + + def _delegate_property_get(self, name): + return getattr(self.categorical, name) + + def _delegate_property_set(self, name, new_values): + return setattr(self.categorical, name, new_values) + + @property + def codes(self): + from pandas import Series + return Series(self.categorical.codes, index=self.index) + + def _delegate_method(self, name, *args, **kwargs): + from pandas import Series + method = getattr(self.categorical, name) + res = method(*args, **kwargs) + if res is not None: + return Series(res, index=self.index, name=self.name) + + +CategoricalAccessor._add_delegate_accessors(delegate=Categorical, + accessors=["categories", + "ordered"], + typ='property') +CategoricalAccessor._add_delegate_accessors(delegate=Categorical, accessors=[ + "rename_categories", "reorder_categories", "add_categories", + "remove_categories", "remove_unused_categories", "set_categories", + "as_ordered", "as_unordered"], typ='method') + +# utility routines + + +def _get_codes_for_values(values, categories): + """ + utility routine to turn values into codes given the specified categories + """ + + from pandas.core.algorithms import _get_data_algo, _hashtables + if not is_dtype_equal(values.dtype, categories.dtype): + values = _ensure_object(values) + categories = _ensure_object(categories) + + (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) + (_, _), cats = _get_data_algo(categories, _hashtables) + t = hash_klass(len(cats)) + t.map_locations(cats) + return coerce_indexer_dtype(t.lookup(vals), cats) + + +def _recode_for_categories(codes, old_categories, new_categories): + """ + Convert a set of codes for to a new set of categories + + Parameters + ---------- + codes : array + old_categories, new_categories : Index + + Returns + ------- + new_codes : array + + Examples + -------- + >>> old_cat = pd.Index(['b', 'a', 'c']) + >>> new_cat = pd.Index(['a', 'b']) + >>> codes = np.array([0, 1, 1, 2]) + >>> _recode_for_categories(codes, old_cat, new_cat) + array([ 1, 0, 0, -1]) + """ + from pandas.core.algorithms import take_1d + + if len(old_categories) == 0: + # All null anyway, so just retain the nulls + return codes.copy() + indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), + new_categories) + new_codes = take_1d(indexer, codes.copy(), fill_value=-1) + return new_codes + + +def _convert_to_list_like(list_like): + if hasattr(list_like, "dtype"): + return list_like + if isinstance(list_like, list): + return list_like + if (is_sequence(list_like) or isinstance(list_like, tuple) or + isinstance(list_like, types.GeneratorType)): + return list(list_like) + elif is_scalar(list_like): + return [list_like] + else: + # is this reached? 
+ return [list_like] + + +def _factorize_from_iterable(values): + """ + Factorize an input `values` into `categories` and `codes`. Preserves + categorical dtype in `categories`. + + *This is an internal function* + + Parameters + ---------- + values : list-like + + Returns + ------- + codes : ndarray + categories : Index + If `values` has a categorical dtype, then `categories` is + a CategoricalIndex keeping the categories and order of `values`. + """ + from pandas.core.indexes.category import CategoricalIndex + + if not is_list_like(values): + raise TypeError("Input must be list-like") + + if is_categorical(values): + if isinstance(values, (ABCCategoricalIndex, ABCSeries)): + values = values._values + categories = CategoricalIndex(values.categories, + categories=values.categories, + ordered=values.ordered) + codes = values.codes + else: + cat = Categorical(values, ordered=True) + categories = cat.categories + codes = cat.codes + return codes, categories + + +def _factorize_from_iterables(iterables): + """ + A higher-level wrapper over `_factorize_from_iterable`. + + *This is an internal function* + + Parameters + ---------- + iterables : list-like of list-likes + + Returns + ------- + codes_list : list of ndarrays + categories_list : list of Indexes + + Notes + ----- + See `_factorize_from_iterable` for more info. + """ + if len(iterables) == 0: + # For consistency, it should return a list of 2 lists. + return [[], []] + return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables])) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 7b11e37a14b512..17435dfc48bde1 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1,2331 +1,7 @@ -# pylint: disable=E1101,W0232 +import warnings -import numpy as np -from warnings import warn -import types +# TODO: Remove after 0.23.x +warnings.warn("'pandas.core' is private. 
Use 'pandas.Categorical'", + FutureWarning, stacklevel=2) -from pandas import compat -from pandas.compat import u, lzip -from pandas._libs import lib, algos as libalgos - -from pandas.core.dtypes.generic import ( - ABCSeries, ABCIndexClass, ABCCategoricalIndex) -from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.cast import ( - maybe_infer_to_datetimelike, - coerce_indexer_dtype) -from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.common import ( - _ensure_int64, - _ensure_object, - _ensure_platform_int, - is_dtype_equal, - is_datetimelike, - is_datetime64_dtype, - is_timedelta64_dtype, - is_categorical, - is_categorical_dtype, - is_list_like, is_sequence, - is_scalar, - is_dict_like) -from pandas.core.common import is_null_slice, _maybe_box_datetimelike - -from pandas.core.algorithms import factorize, take_1d, unique1d -from pandas.core.accessor import PandasDelegate -from pandas.core.base import (PandasObject, - NoNewAttributesMixin, _shared_docs) -import pandas.core.common as com -from pandas.core.missing import interpolate_2d -from pandas.compat.numpy import function as nv -from pandas.util._decorators import ( - Appender, cache_readonly, deprecate_kwarg, Substitution) - -from pandas.io.formats.terminal import get_terminal_size -from pandas.util._validators import validate_bool_kwarg -from pandas.core.config import get_option - - -def _cat_compare_op(op): - def f(self, other): - # On python2, you can usually compare any type to any type, and - # Categoricals can be seen as a custom type, but having different - # results depending whether categories are the same or not is kind of - # insane, so be a bit stricter here and use the python3 idea of - # comparing only things of equal type. - if not self.ordered: - if op in ['__lt__', '__gt__', '__le__', '__ge__']: - raise TypeError("Unordered Categoricals can only compare " - "equality or not") - if isinstance(other, Categorical): - # Two Categoricals can only be be compared if the categories are - # the same (maybe up to ordering, depending on ordered) - - msg = ("Categoricals can only be compared if " - "'categories' are the same.") - if len(self.categories) != len(other.categories): - raise TypeError(msg + " Categories are different lengths") - elif (self.ordered and not (self.categories == - other.categories).all()): - raise TypeError(msg) - elif not set(self.categories) == set(other.categories): - raise TypeError(msg) - - if not (self.ordered == other.ordered): - raise TypeError("Categoricals can only be compared if " - "'ordered' is the same") - if not self.ordered and not self.categories.equals( - other.categories): - # both unordered and different order - other_codes = _get_codes_for_values(other, self.categories) - else: - other_codes = other._codes - - na_mask = (self._codes == -1) | (other_codes == -1) - f = getattr(self._codes, op) - ret = f(other_codes) - if na_mask.any(): - # In other series, the leads to False, so do that here too - ret[na_mask] = False - return ret - - # Numpy-1.9 and earlier may convert a scalar to a zerodim array during - # comparison operation when second arg has higher priority, e.g. - # - # cat[0] < cat - # - # With cat[0], for example, being ``np.int64(1)`` by the time it gets - # into this function would become ``np.array(1)``. 
- other = lib.item_from_zerodim(other) - if is_scalar(other): - if other in self.categories: - i = self.categories.get_loc(other) - return getattr(self._codes, op)(i) - else: - if op == '__eq__': - return np.repeat(False, len(self)) - elif op == '__ne__': - return np.repeat(True, len(self)) - else: - msg = ("Cannot compare a Categorical for op {op} with a " - "scalar, which is not a category.") - raise TypeError(msg.format(op=op)) - else: - - # allow categorical vs object dtype array comparisons for equality - # these are only positional comparisons - if op in ['__eq__', '__ne__']: - return getattr(np.array(self), op)(np.array(other)) - - msg = ("Cannot compare a Categorical for op {op} with type {typ}." - "\nIf you want to compare values, use 'np.asarray(cat) " - " other'.") - raise TypeError(msg.format(op=op, typ=type(other))) - - f.__name__ = op - - return f - - -def _maybe_to_categorical(array): - """ - Coerce to a categorical if a series is given. - - Internal use ONLY. - """ - if isinstance(array, (ABCSeries, ABCCategoricalIndex)): - return array._values - elif isinstance(array, np.ndarray): - return Categorical(array) - return array - - -_codes_doc = """The category codes of this categorical. - -Level codes are an array if integer which are the positions of the real -values in the categories array. - -There is not setter, use the other categorical methods and the normal item -setter to change values in the categorical. -""" - - -class Categorical(PandasObject): - """ - Represents a categorical variable in classic R / S-plus fashion - - `Categoricals` can only take on only a limited, and usually fixed, number - of possible values (`categories`). In contrast to statistical categorical - variables, a `Categorical` might have an order, but numerical operations - (additions, divisions, ...) are not possible. - - All values of the `Categorical` are either in `categories` or `np.nan`. - Assigning values outside of `categories` will raise a `ValueError`. Order - is defined by the order of the `categories`, not lexical order of the - values. - - Parameters - ---------- - values : list-like - The values of the categorical. If categories are given, values not in - categories will be replaced with NaN. - categories : Index-like (unique), optional - The unique categories for this categorical. If not given, the - categories are assumed to be the unique values of values. - ordered : boolean, (default False) - Whether or not this categorical is treated as a ordered categorical. - If not given, the resulting categorical will not be ordered. - dtype : CategoricalDtype - An instance of ``CategoricalDtype`` to use for this categorical - - .. versionadded:: 0.21.0 - - Attributes - ---------- - categories : Index - The categories of this categorical - codes : ndarray - The codes (integer positions, which point to the categories) of this - categorical, read only. - ordered : boolean - Whether or not this Categorical is ordered. - dtype : CategoricalDtype - The instance of ``CategoricalDtype`` storing the ``categories`` - and ``ordered``. - - .. versionadded:: 0.21.0 - - Methods - ------- - from_codes - __array__ - - Raises - ------ - ValueError - If the categories do not validate. - TypeError - If an explicit ``ordered=True`` is given but no `categories` and the - `values` are not sortable. 
- - Examples - -------- - >>> pd.Categorical([1, 2, 3, 1, 2, 3]) - [1, 2, 3, 1, 2, 3] - Categories (3, int64): [1, 2, 3] - - >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) - [a, b, c, a, b, c] - Categories (3, object): [a, b, c] - - Ordered `Categoricals` can be sorted according to the custom order - of the categories and can have a min and max value. - - >>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True, - ... categories=['c', 'b', 'a']) - >>> c - [a, b, c, a, b, c] - Categories (3, object): [c < b < a] - >>> c.min() - 'c' - - Notes - ----- - See the `user guide - `_ for more. - - See also - -------- - pandas.api.types.CategoricalDtype : Type for categorical data - CategoricalIndex : An Index with an underlying ``Categorical`` - """ - - # For comparisons, so that numpy uses our implementation if the compare - # ops, which raise - __array_priority__ = 1000 - _dtype = CategoricalDtype() - _deprecations = frozenset(['labels']) - _typ = 'categorical' - - def __init__(self, values, categories=None, ordered=None, dtype=None, - fastpath=False): - - # Ways of specifying the dtype (prioritized ordered) - # 1. dtype is a CategoricalDtype - # a.) with known categories, use dtype.categories - # b.) else with Categorical values, use values.dtype - # c.) else, infer from values - # d.) specifying dtype=CategoricalDtype and categories is an error - # 2. dtype is a string 'category' - # a.) use categories, ordered - # b.) use values.dtype - # c.) infer from values - # 3. dtype is None - # a.) use categories, ordered - # b.) use values.dtype - # c.) infer from values - - if dtype is not None: - # The dtype argument takes precedence over values.dtype (if any) - if isinstance(dtype, compat.string_types): - if dtype == 'category': - dtype = CategoricalDtype(categories, ordered) - else: - msg = "Unknown `dtype` {dtype}" - raise ValueError(msg.format(dtype=dtype)) - elif categories is not None or ordered is not None: - raise ValueError("Cannot specify both `dtype` and `categories`" - " or `ordered`.") - - categories = dtype.categories - ordered = dtype.ordered - - elif is_categorical(values): - # If no "dtype" was passed, use the one from "values", but honor - # the "ordered" and "categories" arguments - dtype = values.dtype._from_categorical_dtype(values.dtype, - categories, ordered) - else: - # If dtype=None and values is not categorical, create a new dtype - dtype = CategoricalDtype(categories, ordered) - - # At this point, dtype is always a CategoricalDtype - # if dtype.categories is None, we are inferring - - if fastpath: - self._codes = coerce_indexer_dtype(values, categories) - self._dtype = dtype - return - - # null_mask indicates missing values we want to exclude from inference. - # This means: only missing values in list-likes (not arrays/ndframes). 
- null_mask = np.array(False) - - # sanitize input - if is_categorical_dtype(values): - if dtype.categories is None: - dtype = CategoricalDtype(values.categories, dtype.ordered) - - elif not isinstance(values, (ABCIndexClass, ABCSeries)): - # _sanitize_array coerces np.nan to a string under certain versions - # of numpy - values = maybe_infer_to_datetimelike(values, convert_dates=True) - if not isinstance(values, np.ndarray): - values = _convert_to_list_like(values) - from pandas.core.series import _sanitize_array - # By convention, empty lists result in object dtype: - if len(values) == 0: - sanitize_dtype = 'object' - else: - sanitize_dtype = None - null_mask = isna(values) - if null_mask.any(): - values = [values[idx] for idx in np.where(~null_mask)[0]] - values = _sanitize_array(values, None, dtype=sanitize_dtype) - - if dtype.categories is None: - try: - codes, categories = factorize(values, sort=True) - except TypeError: - codes, categories = factorize(values, sort=False) - if dtype.ordered: - # raise, as we don't have a sortable data structure and so - # the user should give us one by specifying categories - raise TypeError("'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument.") - except ValueError: - - # FIXME - raise NotImplementedError("> 1 ndim Categorical are not " - "supported at this time") - - # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) - - elif is_categorical_dtype(values): - old_codes = (values.cat.codes if isinstance(values, ABCSeries) - else values.codes) - codes = _recode_for_categories(old_codes, values.dtype.categories, - dtype.categories) - - else: - codes = _get_codes_for_values(values, dtype.categories) - - if null_mask.any(): - # Reinsert -1 placeholders for previously removed missing values - full_codes = - np.ones(null_mask.shape, dtype=codes.dtype) - full_codes[~null_mask] = codes - codes = full_codes - - self._dtype = dtype - self._codes = coerce_indexer_dtype(codes, dtype.categories) - - @property - def categories(self): - """The categories of this categorical. - - Setting assigns new values to each category (effectively a rename of - each individual category). - - The assigned value has to be a list-like object. All items must be - unique and the number of items in the new categories must be the same - as the number of items in the old categories. - - Assigning to `categories` is a inplace operation! - - Raises - ------ - ValueError - If the new categories do not validate as categories or if the - number of new categories is unequal the number of old categories - - See also - -------- - rename_categories - reorder_categories - add_categories - remove_categories - remove_unused_categories - set_categories - """ - return self.dtype.categories - - @categories.setter - def categories(self, categories): - new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if (self.dtype.categories is not None and - len(self.dtype.categories) != len(new_dtype.categories)): - raise ValueError("new categories need to have the same number of " - "items as the old categories!") - self._dtype = new_dtype - - @property - def ordered(self): - """Whether the categories have an ordered relationship""" - return self.dtype.ordered - - @property - def dtype(self): - """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" - return self._dtype - - @property - def _constructor(self): - return Categorical - - def copy(self): - """ Copy constructor. 
""" - return self._constructor(values=self._codes.copy(), - categories=self.categories, - ordered=self.ordered, - fastpath=True) - - def astype(self, dtype, copy=True): - """ - Coerce this type to another dtype - - Parameters - ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. - If copy is set to False and dtype is categorical, the original - object is returned. - - .. versionadded:: 0.19.0 - - """ - if is_categorical_dtype(dtype): - # GH 10696/18593 - dtype = self.dtype._update_dtype(dtype) - self = self.copy() if copy else self - if dtype == self.dtype: - return self - return self._set_dtype(dtype) - return np.array(self, dtype=dtype, copy=copy) - - @cache_readonly - def ndim(self): - """Number of dimensions of the Categorical """ - return self._codes.ndim - - @cache_readonly - def size(self): - """ return the len of myself """ - return len(self) - - @cache_readonly - def itemsize(self): - """ return the size of a single category """ - return self.categories.itemsize - - def tolist(self): - """ - Return a list of the values. - - These are each a scalar type, which is a Python scalar - (for str, int, float) or a pandas scalar - (for Timestamp/Timedelta/Interval/Period) - """ - if is_datetimelike(self.categories): - return [_maybe_box_datetimelike(x) for x in self] - return np.array(self).tolist() - - @property - def base(self): - """ compat, we are always our own object """ - return None - - @classmethod - def _from_inferred_categories(cls, inferred_categories, inferred_codes, - dtype): - """Construct a Categorical from inferred values - - For inferred categories (`dtype` is None) the categories are sorted. - For explicit `dtype`, the `inferred_categories` are cast to the - appropriate type. - - Parameters - ---------- - - inferred_categories : Index - inferred_codes : Index - dtype : CategoricalDtype or 'category' - - Returns - ------- - Categorical - """ - from pandas import Index, to_numeric, to_datetime, to_timedelta - - cats = Index(inferred_categories) - - known_categories = (isinstance(dtype, CategoricalDtype) and - dtype.categories is not None) - - if known_categories: - # Convert to a specialzed type with `dtype` if specified - if dtype.categories.is_numeric(): - cats = to_numeric(inferred_categories, errors='coerce') - elif is_datetime64_dtype(dtype.categories): - cats = to_datetime(inferred_categories, errors='coerce') - elif is_timedelta64_dtype(dtype.categories): - cats = to_timedelta(inferred_categories, errors='coerce') - - if known_categories: - # recode from observation oder to dtype.categories order - categories = dtype.categories - codes = _recode_for_categories(inferred_codes, cats, categories) - elif not cats.is_monotonic_increasing: - # sort categories and recode for unknown categories - unsorted = cats.copy() - categories = cats.sort_values() - codes = _recode_for_categories(inferred_codes, unsorted, - categories) - dtype = CategoricalDtype(categories, ordered=False) - else: - dtype = CategoricalDtype(cats, ordered=False) - codes = inferred_codes - - return cls(codes, dtype=dtype, fastpath=True) - - @classmethod - def from_codes(cls, codes, categories, ordered=False): - """ - Make a Categorical type from codes and categories arrays. - - This constructor is useful if you already have codes and categories and - so do not need the (computation intensive) factorization step, which is - usually done on the constructor. 
- - If your data does not follow this convention, please use the normal - constructor. - - Parameters - ---------- - codes : array-like, integers - An integer array, where each integer points to a category in - categories or -1 for NaN - categories : index-like - The categories for the categorical. Items need to be unique. - ordered : boolean, (default False) - Whether or not this categorical is treated as a ordered - categorical. If not given, the resulting categorical will be - unordered. - """ - try: - codes = np.asarray(codes, np.int64) - except: - raise ValueError( - "codes need to be convertible to an arrays of integers") - - categories = CategoricalDtype._validate_categories(categories) - - if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): - raise ValueError("codes need to be between -1 and " - "len(categories)-1") - - return cls(codes, categories=categories, ordered=ordered, - fastpath=True) - - _codes = None - - def _get_codes(self): - """ Get the codes. - - Returns - ------- - codes : integer array view - A non writable view of the `codes` array. - """ - v = self._codes.view() - v.flags.writeable = False - return v - - def _set_codes(self, codes): - """ - Not settable by the user directly - """ - raise ValueError("cannot set Categorical codes directly") - - codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) - - def _set_categories(self, categories, fastpath=False): - """ Sets new categories inplace - - Parameters - ---------- - fastpath : boolean (default: False) - Don't perform validation of the categories for uniqueness or nulls - - Examples - -------- - >>> c = Categorical(['a', 'b']) - >>> c - [a, b] - Categories (2, object): [a, b] - - >>> c._set_categories(pd.Index(['a', 'c'])) - >>> c - [a, c] - Categories (2, object): [a, c] - """ - - if fastpath: - new_dtype = CategoricalDtype._from_fastpath(categories, - self.ordered) - else: - new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if (not fastpath and self.dtype.categories is not None and - len(new_dtype.categories) != len(self.dtype.categories)): - raise ValueError("new categories need to have the same number of " - "items than the old categories!") - - self._dtype = new_dtype - - def _codes_for_groupby(self, sort): - """ - If sort=False, return a copy of self, coded with categories as - returned by .unique(), followed by any categories not appearing in - the data. If sort=True, return self. - - This method is needed solely to ensure the categorical index of the - GroupBy result has categories in the order of appearance in the data - (GH-8868). - - Parameters - ---------- - sort : boolean - The value of the sort parameter groupby was called with. - - Returns - ------- - Categorical - If sort=False, the new categories are set to the order of - appearance in codes (unless ordered=True, in which case the - original order is preserved), followed by any unrepresented - categories in the original order. 
- """ - - # Already sorted according to self.categories; all is fine - if sort: - return self - - # sort=False should order groups in as-encountered order (GH-8868) - cat = self.unique() - - # But for groupby to work, all categories should be present, - # including those missing from the data (GH-13179), which .unique() - # above dropped - cat.add_categories( - self.categories[~self.categories.isin(cat.categories)], - inplace=True) - - return self.reorder_categories(cat.categories) - - def _set_dtype(self, dtype): - """Internal method for directly updating the CategoricalDtype - - Parameters - ---------- - dtype : CategoricalDtype - - Notes - ----- - We don't do any validation here. It's assumed that the dtype is - a (valid) instance of `CategoricalDtype`. - """ - codes = _recode_for_categories(self.codes, self.categories, - dtype.categories) - return type(self)(codes, dtype=dtype, fastpath=True) - - def set_ordered(self, value, inplace=False): - """ - Sets the ordered attribute to the boolean value - - Parameters - ---------- - value : boolean to set whether this categorical is ordered (True) or - not (False) - inplace : boolean (default: False) - Whether or not to set the ordered attribute inplace or return a copy - of this categorical with ordered set to the value - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - new_dtype = CategoricalDtype(self.categories, ordered=value) - cat = self if inplace else self.copy() - cat._dtype = new_dtype - if not inplace: - return cat - - def as_ordered(self, inplace=False): - """ - Sets the Categorical to be ordered - - Parameters - ---------- - inplace : boolean (default: False) - Whether or not to set the ordered attribute inplace or return a copy - of this categorical with ordered set to True - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - return self.set_ordered(True, inplace=inplace) - - def as_unordered(self, inplace=False): - """ - Sets the Categorical to be unordered - - Parameters - ---------- - inplace : boolean (default: False) - Whether or not to set the ordered attribute inplace or return a copy - of this categorical with ordered set to False - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - return self.set_ordered(False, inplace=inplace) - - def set_categories(self, new_categories, ordered=None, rename=False, - inplace=False): - """ Sets the categories to the specified new_categories. - - `new_categories` can include new categories (which will result in - unused categories) or remove old categories (which results in values - set to NaN). If `rename==True`, the categories will simple be renamed - (less or more items than in old categories will result in values set to - NaN or in unused categories respectively). - - This method can be used to perform more than one action of adding, - removing, and reordering simultaneously and is therefore faster than - performing the individual steps via the more specialised methods. - - On the other hand this methods does not do checks (e.g., whether the - old categories are included in the new categories on a reorder), which - can result in surprising changes, for example when using special string - dtypes on python3, which does not considers a S1 string equal to a - single char python string. - - Raises - ------ - ValueError - If new_categories does not validate as categories - - Parameters - ---------- - new_categories : Index-like - The categories in new order. - ordered : boolean, (default: False) - Whether or not the categorical is treated as a ordered categorical. 
- If not given, do not change the ordered information. - rename : boolean (default: False) - Whether or not the new_categories should be considered as a rename - of the old categories or as reordered categories. - inplace : boolean (default: False) - Whether or not to reorder the categories inplace or return a copy of - this categorical with reordered categories. - - Returns - ------- - cat : Categorical with reordered categories or None if inplace. - - See also - -------- - rename_categories - reorder_categories - add_categories - remove_categories - remove_unused_categories - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if ordered is None: - ordered = self.dtype.ordered - new_dtype = CategoricalDtype(new_categories, ordered=ordered) - - cat = self if inplace else self.copy() - if rename: - if (cat.dtype.categories is not None and - len(new_dtype.categories) < len(cat.dtype.categories)): - # remove all _codes which are larger and set to -1/NaN - self._codes[self._codes >= len(new_dtype.categories)] = -1 - else: - codes = _recode_for_categories(self.codes, self.categories, - new_dtype.categories) - cat._codes = codes - cat._dtype = new_dtype - - if not inplace: - return cat - - def rename_categories(self, new_categories, inplace=False): - """ Renames categories. - - Raises - ------ - ValueError - If new categories are list-like and do not have the same number of - items than the current categories or do not validate as categories - - Parameters - ---------- - new_categories : list-like, dict-like or callable - - * list-like: all items must be unique and the number of items in - the new categories must match the existing number of categories. - - * dict-like: specifies a mapping from - old categories to new. Categories not contained in the mapping - are passed through and extra categories in the mapping are - ignored. - - .. versionadded:: 0.21.0 - - * callable : a callable that is called on all items in the old - categories and whose return values comprise the new categories. - - .. versionadded:: 0.23.0 - - .. warning:: - - Currently, Series are considered list like. In a future version - of pandas they'll be considered dict-like. - - inplace : boolean (default: False) - Whether or not to rename the categories inplace or return a copy of - this categorical with renamed categories. - - Returns - ------- - cat : Categorical or None - With ``inplace=False``, the new categorical is returned. - With ``inplace=True``, there is no return value. - - See also - -------- - reorder_categories - add_categories - remove_categories - remove_unused_categories - set_categories - - Examples - -------- - >>> c = Categorical(['a', 'a', 'b']) - >>> c.rename_categories([0, 1]) - [0, 0, 1] - Categories (2, int64): [0, 1] - - For dict-like ``new_categories``, extra keys are ignored and - categories not in the dictionary are passed through - - >>> c.rename_categories({'a': 'A', 'c': 'C'}) - [A, A, b] - Categories (2, object): [A, b] - - You may also provide a callable to create the new categories - - >>> c.rename_categories(lambda x: x.upper()) - [A, A, B] - Categories (2, object): [A, B] - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - cat = self if inplace else self.copy() - - if isinstance(new_categories, ABCSeries): - msg = ("Treating Series 'new_categories' as a list-like and using " - "the values. 
In a future version, 'rename_categories' will " - "treat Series like a dictionary.\n" - "For dict-like, use 'new_categories.to_dict()'\n" - "For list-like, use 'new_categories.values'.") - warn(msg, FutureWarning, stacklevel=2) - new_categories = list(new_categories) - - if is_dict_like(new_categories): - cat.categories = [new_categories.get(item, item) - for item in cat.categories] - elif callable(new_categories): - cat.categories = [new_categories(item) for item in cat.categories] - else: - cat.categories = new_categories - if not inplace: - return cat - - def reorder_categories(self, new_categories, ordered=None, inplace=False): - """ Reorders categories as specified in new_categories. - - `new_categories` need to include all old categories and no new category - items. - - Raises - ------ - ValueError - If the new categories do not contain all old category items or any - new ones - - Parameters - ---------- - new_categories : Index-like - The categories in new order. - ordered : boolean, optional - Whether or not the categorical is treated as a ordered categorical. - If not given, do not change the ordered information. - inplace : boolean (default: False) - Whether or not to reorder the categories inplace or return a copy of - this categorical with reordered categories. - - Returns - ------- - cat : Categorical with reordered categories or None if inplace. - - See also - -------- - rename_categories - add_categories - remove_categories - remove_unused_categories - set_categories - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if set(self.dtype.categories) != set(new_categories): - raise ValueError("items in new_categories are not the same as in " - "old categories") - return self.set_categories(new_categories, ordered=ordered, - inplace=inplace) - - def add_categories(self, new_categories, inplace=False): - """ Add new categories. - - `new_categories` will be included at the last/highest place in the - categories and will be unused directly after this call. - - Raises - ------ - ValueError - If the new categories include old categories or do not validate as - categories - - Parameters - ---------- - new_categories : category or list-like of category - The new categories to be included. - inplace : boolean (default: False) - Whether or not to add the categories inplace or return a copy of - this categorical with added categories. - - Returns - ------- - cat : Categorical with new categories added or None if inplace. - - See also - -------- - rename_categories - reorder_categories - remove_categories - remove_unused_categories - set_categories - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if not is_list_like(new_categories): - new_categories = [new_categories] - already_included = set(new_categories) & set(self.dtype.categories) - if len(already_included) != 0: - msg = ("new categories must not include old categories: " - "{already_included!s}") - raise ValueError(msg.format(already_included=already_included)) - new_categories = list(self.dtype.categories) + list(new_categories) - new_dtype = CategoricalDtype(new_categories, self.ordered) - - cat = self if inplace else self.copy() - cat._dtype = new_dtype - cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) - if not inplace: - return cat - - def remove_categories(self, removals, inplace=False): - """ Removes the specified categories. - - `removals` must be included in the old categories. 
Values which were in - the removed categories will be set to NaN - - Raises - ------ - ValueError - If the removals are not contained in the categories - - Parameters - ---------- - removals : category or list of categories - The categories which should be removed. - inplace : boolean (default: False) - Whether or not to remove the categories inplace or return a copy of - this categorical with removed categories. - - Returns - ------- - cat : Categorical with removed categories or None if inplace. - - See also - -------- - rename_categories - reorder_categories - add_categories - remove_unused_categories - set_categories - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if not is_list_like(removals): - removals = [removals] - - removal_set = set(list(removals)) - not_included = removal_set - set(self.dtype.categories) - new_categories = [c for c in self.dtype.categories - if c not in removal_set] - - # GH 10156 - if any(isna(removals)): - not_included = [x for x in not_included if notna(x)] - new_categories = [x for x in new_categories if notna(x)] - - if len(not_included) != 0: - msg = "removals must all be in old categories: {not_included!s}" - raise ValueError(msg.format(not_included=not_included)) - - return self.set_categories(new_categories, ordered=self.ordered, - rename=False, inplace=inplace) - - def remove_unused_categories(self, inplace=False): - """ Removes categories which are not used. - - Parameters - ---------- - inplace : boolean (default: False) - Whether or not to drop unused categories inplace or return a copy of - this categorical with unused categories dropped. - - Returns - ------- - cat : Categorical with unused categories dropped or None if inplace. - - See also - -------- - rename_categories - reorder_categories - add_categories - remove_categories - set_categories - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - cat = self if inplace else self.copy() - idx, inv = np.unique(cat._codes, return_inverse=True) - - if idx.size != 0 and idx[0] == -1: # na sentinel - idx, inv = idx[1:], inv - 1 - - new_categories = cat.dtype.categories.take(idx) - new_dtype = CategoricalDtype._from_fastpath(new_categories, - ordered=self.ordered) - cat._dtype = new_dtype - cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) - - if not inplace: - return cat - - def map(self, mapper): - """Apply mapper function to its categories (not codes). - - Parameters - ---------- - mapper : callable - Function to be applied. When all categories are mapped - to different categories, the result will be Categorical which has - the same order property as the original. Otherwise, the result will - be np.ndarray. - - Returns - ------- - applied : Categorical or Index. - - """ - new_categories = self.categories.map(mapper) - try: - return self.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) - except ValueError: - return np.take(new_categories, self._codes) - - __eq__ = _cat_compare_op('__eq__') - __ne__ = _cat_compare_op('__ne__') - __lt__ = _cat_compare_op('__lt__') - __gt__ = _cat_compare_op('__gt__') - __le__ = _cat_compare_op('__le__') - __ge__ = _cat_compare_op('__ge__') - - # for Series/ndarray like compat - @property - def shape(self): - """ Shape of the Categorical. - - For internal compatibility with numpy arrays. - - Returns - ------- - shape : tuple - """ - - return tuple([len(self._codes)]) - - def shift(self, periods): - """ - Shift Categorical by desired number of periods. 
- - Parameters - ---------- - periods : int - Number of periods to move, can be positive or negative - - Returns - ------- - shifted : Categorical - """ - # since categoricals always have ndim == 1, an axis parameter - # doesn't make any sense here. - codes = self.codes - if codes.ndim > 1: - raise NotImplementedError("Categorical with ndim > 1.") - if np.prod(codes.shape) and (periods != 0): - codes = np.roll(codes, _ensure_platform_int(periods), axis=0) - if periods > 0: - codes[:periods] = -1 - else: - codes[periods:] = -1 - - return self.from_codes(codes, categories=self.categories, - ordered=self.ordered) - - def __array__(self, dtype=None): - """ - The numpy array interface. - - Returns - ------- - values : numpy array - A numpy array of either the specified dtype or, - if dtype==None (default), the same dtype as - categorical.categories.dtype - """ - ret = take_1d(self.categories.values, self._codes) - if dtype and not is_dtype_equal(dtype, self.categories.dtype): - return np.asarray(ret, dtype) - return ret - - def __setstate__(self, state): - """Necessary for making this object picklable""" - if not isinstance(state, dict): - raise Exception('invalid pickle state') - - # Provide compatibility with pre-0.15.0 Categoricals. - if '_categories' not in state and '_levels' in state: - state['_categories'] = self.dtype._validate_categories(state.pop( - '_levels')) - if '_codes' not in state and 'labels' in state: - state['_codes'] = coerce_indexer_dtype( - state.pop('labels'), state['_categories']) - - # 0.16.0 ordered change - if '_ordered' not in state: - - # >=15.0 < 0.16.0 - if 'ordered' in state: - state['_ordered'] = state.pop('ordered') - else: - state['_ordered'] = False - - # 0.21.0 CategoricalDtype change - if '_dtype' not in state: - state['_dtype'] = CategoricalDtype(state['_categories'], - state['_ordered']) - - for k, v in compat.iteritems(state): - setattr(self, k, v) - - @property - def T(self): - return self - - @property - def nbytes(self): - return self._codes.nbytes + self.dtype.categories.values.nbytes - - def memory_usage(self, deep=False): - """ - Memory usage of my values - - Parameters - ---------- - deep : bool - Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption - - Returns - ------- - bytes used - - Notes - ----- - Memory usage does not include memory consumed by elements that - are not components of the array if deep=False - - See Also - -------- - numpy.ndarray.nbytes - """ - return self._codes.nbytes + self.dtype.categories.memory_usage( - deep=deep) - - @Substitution(klass='Categorical') - @Appender(_shared_docs['searchsorted']) - @deprecate_kwarg(old_arg_name='v', new_arg_name='value') - def searchsorted(self, value, side='left', sorter=None): - if not self.ordered: - raise ValueError("Categorical not ordered\nyou can use " - ".as_ordered() to change the Categorical to an " - "ordered one") - - from pandas.core.series import Series - - values_as_codes = _get_codes_for_values(Series(value).values, - self.categories) - - if -1 in values_as_codes: - raise ValueError("Value(s) to be inserted must be in categories.") - - return self.codes.searchsorted(values_as_codes, side=side, - sorter=sorter) - - def isna(self): - """ - Detect missing values - - Both missing values (-1 in .codes) and NA as a category are detected. 
- - Returns - ------- - a boolean array of whether my values are null - - See also - -------- - isna : top-level isna - isnull : alias of isna - Categorical.notna : boolean inverse of Categorical.isna - - """ - - ret = self._codes == -1 - - # String/object and float categories can hold np.nan - if self.categories.dtype.kind in ['S', 'O', 'f']: - if np.nan in self.categories: - nan_pos = np.where(isna(self.categories))[0] - # we only have one NA in categories - ret = np.logical_or(ret, self._codes == nan_pos) - return ret - isnull = isna - - def notna(self): - """ - Inverse of isna - - Both missing values (-1 in .codes) and NA as a category are detected as - null. - - Returns - ------- - a boolean array of whether my values are not null - - See also - -------- - notna : top-level notna - notnull : alias of notna - Categorical.isna : boolean inverse of Categorical.notna - - """ - return ~self.isna() - notnull = notna - - def put(self, *args, **kwargs): - """ - Replace specific elements in the Categorical with given values. - """ - raise NotImplementedError(("'put' is not yet implemented " - "for Categorical")) - - def dropna(self): - """ - Return the Categorical without null values. - - Both missing values (-1 in .codes) and NA as a category are detected. - NA is removed from the categories if present. - - Returns - ------- - valid : Categorical - """ - result = self[self.notna()] - if isna(result.categories).any(): - result = result.remove_categories([np.nan]) - return result - - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : boolean, default True - Don't include counts of NaN, even if NaN is a category. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - - """ - from numpy import bincount - from pandas import isna, Series, CategoricalIndex - - obj = (self.remove_categories([np.nan]) if dropna and - isna(self.categories).any() else self) - code, cat = obj._codes, obj.categories - ncat, mask = len(cat), 0 <= code - ix, clean = np.arange(ncat), mask.all() - - if dropna or clean: - obs = code if clean else code[mask] - count = bincount(obs, minlength=ncat or None) - else: - count = bincount(np.where(mask, code, ncat)) - ix = np.append(ix, -1) - - ix = self._constructor(ix, dtype=self.dtype, - fastpath=True) - - return Series(count, index=CategoricalIndex(ix), dtype='int64') - - def get_values(self): - """ Return the values. - - For internal compatibility with pandas formatting. - - Returns - ------- - values : numpy array - A numpy array of the same dtype as categorical.categories.dtype or - Index if datetime / periods - """ - # if we are a datetime and period index, return Index to keep metadata - if is_datetimelike(self.categories): - return self.categories.take(self._codes, fill_value=np.nan) - return np.array(self) - - def check_for_ordered(self, op): - """ assert that we are ordered """ - if not self.ordered: - raise TypeError("Categorical is not ordered for operation {op}\n" - "you can use .as_ordered() to change the " - "Categorical to an ordered one\n".format(op=op)) - - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """ - Returns the indices that would sort the Categorical instance if - 'sort_values' was called. This function is implemented to provide - compatibility with numpy ndarray objects. 
- - While an ordering is applied to the category values, arg-sorting - in this context refers more to organizing and grouping together - based on matching category values. Thus, this function can be - called on an unordered Categorical instance unlike the functions - 'Categorical.min' and 'Categorical.max'. - - Returns - ------- - argsorted : numpy array - - See also - -------- - numpy.ndarray.argsort - """ - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = np.argsort(self._codes.copy(), kind=kind, **kwargs) - if not ascending: - result = result[::-1] - return result - - def sort_values(self, inplace=False, ascending=True, na_position='last'): - """ Sorts the Categorical by category value returning a new - Categorical by default. - - While an ordering is applied to the category values, sorting in this - context refers more to organizing and grouping together based on - matching category values. Thus, this function can be called on an - unordered Categorical instance unlike the functions 'Categorical.min' - and 'Categorical.max'. - - Parameters - ---------- - inplace : boolean, default False - Do operation in place. - ascending : boolean, default True - Order ascending. Passing False orders descending. The - ordering parameter provides the method by which the - category values are organized. - na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - - Returns - ------- - y : Categorical or None - - See Also - -------- - Categorical.sort - Series.sort_values - - Examples - -------- - >>> c = pd.Categorical([1, 2, 2, 1, 5]) - >>> c - [1, 2, 2, 1, 5] - Categories (3, int64): [1, 2, 5] - >>> c.sort_values() - [1, 1, 2, 2, 5] - Categories (3, int64): [1, 2, 5] - >>> c.sort_values(ascending=False) - [5, 2, 2, 1, 1] - Categories (3, int64): [1, 2, 5] - - Inplace sorting can be done as well: - - >>> c.sort_values(inplace=True) - >>> c - [1, 1, 2, 2, 5] - Categories (3, int64): [1, 2, 5] - >>> - >>> c = pd.Categorical([1, 2, 2, 1, 5]) - - 'sort_values' behaviour with NaNs. Note that 'na_position' - is independent of the 'ascending' parameter: - - >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) - >>> c - [NaN, 2.0, 2.0, NaN, 5.0] - Categories (2, int64): [2, 5] - >>> c.sort_values() - [2.0, 2.0, 5.0, NaN, NaN] - Categories (2, int64): [2, 5] - >>> c.sort_values(ascending=False) - [5.0, 2.0, 2.0, NaN, NaN] - Categories (2, int64): [2, 5] - >>> c.sort_values(na_position='first') - [NaN, NaN, 2.0, 2.0, 5.0] - Categories (2, int64): [2, 5] - >>> c.sort_values(ascending=False, na_position='first') - [NaN, NaN, 5.0, 2.0, 2.0] - Categories (2, int64): [2, 5] - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if na_position not in ['last', 'first']: - msg = 'invalid na_position: {na_position!r}' - raise ValueError(msg.format(na_position=na_position)) - - codes = np.sort(self._codes) - if not ascending: - codes = codes[::-1] - - # NaN handling - na_mask = (codes == -1) - if na_mask.any(): - n_nans = len(codes[na_mask]) - if na_position == "first": - # in this case sort to the front - new_codes = codes.copy() - new_codes[0:n_nans] = -1 - new_codes[n_nans:] = codes[~na_mask] - codes = new_codes - elif na_position == "last": - # ... 
and to the end - new_codes = codes.copy() - pos = len(codes) - n_nans - new_codes[0:pos] = codes[~na_mask] - new_codes[pos:] = -1 - codes = new_codes - if inplace: - self._codes = codes - return - else: - return self._constructor(values=codes, categories=self.categories, - ordered=self.ordered, fastpath=True) - - def _values_for_rank(self): - """ - For correctly ranking ordered categorical data. See GH#15420 - - Ordered categorical data should be ranked on the basis of - codes with -1 translated to NaN. - - Returns - ------- - numpy array - - """ - from pandas import Series - if self.ordered: - values = self.codes - mask = values == -1 - if mask.any(): - values = values.astype('float64') - values[mask] = np.nan - elif self.categories.is_numeric(): - values = np.array(self) - else: - # reorder the categories (so rank can use the float codes) - # instead of passing an object array to rank - values = np.array( - self.rename_categories(Series(self.categories).rank().values) - ) - return values - - def ravel(self, order='C'): - """ Return a flattened (numpy) array. - - For internal compatibility with numpy arrays. - - Returns - ------- - raveled : numpy array - """ - return np.array(self) - - def view(self): - """Return a view of myself. - - For internal compatibility with numpy arrays. - - Returns - ------- - view : Categorical - Returns `self`! - """ - return self - - def to_dense(self): - """Return my 'dense' representation - - For internal compatibility with numpy arrays. - - Returns - ------- - dense : array - """ - return np.asarray(self) - - @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value') - def fillna(self, value=None, method=None, limit=None): - """ Fill NA/NaN values using the specified method. - - Parameters - ---------- - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - value : scalar, dict, Series - If a scalar value is passed it is used to fill all missing values. - Alternatively, a Series or dict can be used to fill in different - values for each index. The value should not be a list. The - value(s) passed should either be in the categories or should be - NaN. - limit : int, default None - (Not implemented yet for Categorical!) - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. 
-
-        Returns
-        -------
-        filled : Categorical with NA/NaN filled
-        """
-
-        if value is None:
-            value = np.nan
-        if limit is not None:
-            raise NotImplementedError("specifying a limit for fillna has not "
-                                      "been implemented yet")
-
-        values = self._codes
-
-        # Make sure that we also get NA in categories
-        if self.categories.dtype.kind in ['S', 'O', 'f']:
-            if np.nan in self.categories:
-                values = values.copy()
-                nan_pos = np.where(isna(self.categories))[0]
-                # we only have one NA in categories
-                values[values == nan_pos] = -1
-
-        # pad / bfill
-        if method is not None:
-
-            values = self.to_dense().reshape(-1, len(self))
-            values = interpolate_2d(values, method, 0, None,
-                                    value).astype(self.categories.dtype)[0]
-            values = _get_codes_for_values(values, self.categories)
-
-        else:
-
-            # If value is a dict or a Series (a dict value has already
-            # been converted to a Series)
-            if isinstance(value, ABCSeries):
-                if not value[~value.isin(self.categories)].isna().all():
-                    raise ValueError("fill value must be in categories")
-
-                values_codes = _get_codes_for_values(value, self.categories)
-                indexer = np.where(values_codes != -1)
-                values[indexer] = values_codes[values_codes != -1]
-
-            # If value is not a dict or Series it should be a scalar
-            elif is_scalar(value):
-                if not isna(value) and value not in self.categories:
-                    raise ValueError("fill value must be in categories")
-
-                mask = values == -1
-                if mask.any():
-                    values = values.copy()
-                    if isna(value):
-                        values[mask] = -1
-                    else:
-                        values[mask] = self.categories.get_loc(value)
-
-            else:
-                raise TypeError('"value" parameter must be a scalar, dict '
-                                'or Series, but you passed a '
-                                '"{0}"'.format(type(value).__name__))
-
-        return self._constructor(values, categories=self.categories,
-                                 ordered=self.ordered, fastpath=True)
-
-    def take_nd(self, indexer, allow_fill=True, fill_value=None):
-        """ Take the codes by the indexer, fill with the fill_value.
-
-        For internal compatibility with numpy arrays.
-        """
-
-        # filling must always be None/nan here
-        # but is passed thru internally
-        assert isna(fill_value)
-
-        codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
-        result = self._constructor(codes, categories=self.categories,
-                                   ordered=self.ordered, fastpath=True)
-        return result
-
-    take = take_nd
-
-    def _slice(self, slicer):
-        """ Return a slice of myself.
-
-        For internal compatibility with numpy arrays.
-        """
-
-        # only allow 1 dimensional slicing, but can
-        # in a 2-d case be passd (slice(None),....)
-        if isinstance(slicer, tuple) and len(slicer) == 2:
-            if not is_null_slice(slicer[0]):
-                raise AssertionError("invalid slicing for a 1-ndim "
-                                     "categorical")
-            slicer = slicer[1]
-
-        _codes = self._codes[slicer]
-        return self._constructor(values=_codes, categories=self.categories,
-                                 ordered=self.ordered, fastpath=True)
-
-    def __len__(self):
-        """The length of this Categorical."""
-        return len(self._codes)
-
-    def __iter__(self):
-        """Returns an Iterator over the values of this Categorical."""
-        return iter(self.get_values())
-
-    def _tidy_repr(self, max_vals=10, footer=True):
-        """ a short repr displaying only max_vals and an optional (but default
-        footer)
-        """
-        num = max_vals // 2
-        head = self[:num]._get_repr(length=False, footer=False)
-        tail = self[-(max_vals - num):]._get_repr(length=False, footer=False)
-
-        result = u('{head}, ..., {tail}').format(head=head[:-1], tail=tail[1:])
-        if footer:
-            result = u('{result}\n{footer}').format(result=result,
-                                                    footer=self._repr_footer())
-
-        return compat.text_type(result)
-
-    def _repr_categories(self):
-        """ return the base repr for the categories """
-        max_categories = (10 if get_option("display.max_categories") == 0 else
-                          get_option("display.max_categories"))
-        from pandas.io.formats import format as fmt
-        if len(self.categories) > max_categories:
-            num = max_categories // 2
-            head = fmt.format_array(self.categories[:num], None)
-            tail = fmt.format_array(self.categories[-num:], None)
-            category_strs = head + ["..."] + tail
-        else:
-            category_strs = fmt.format_array(self.categories, None)
-
-        # Strip all leading spaces, which format_array adds for columns...
-        category_strs = [x.strip() for x in category_strs]
-        return category_strs
-
-    def _repr_categories_info(self):
-        """ Returns a string representation of the footer."""
-
-        category_strs = self._repr_categories()
-        dtype = getattr(self.categories, 'dtype_str',
-                        str(self.categories.dtype))
-
-        levheader = "Categories ({length}, {dtype}): ".format(
-            length=len(self.categories), dtype=dtype)
-        width, height = get_terminal_size()
-        max_width = get_option("display.width") or width
-        if com.in_ipython_frontend():
-            # 0 = no breaks
-            max_width = 0
-        levstring = ""
-        start = True
-        cur_col_len = len(levheader)  # header
-        sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
-        linesep = sep.rstrip() + "\n"  # remove whitespace
-        for val in category_strs:
-            if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
-                levstring += linesep + (" " * (len(levheader) + 1))
-                cur_col_len = len(levheader) + 1  # header + a whitespace
-            elif not start:
-                levstring += sep
-                cur_col_len += len(val)
-            levstring += val
-            start = False
-        # replace to simple save space by
-        return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"
-
-    def _repr_footer(self):
-
-        return u('Length: {length}\n{info}').format(
-            length=len(self), info=self._repr_categories_info())
-
-    def _get_repr(self, length=True, na_rep='NaN', footer=True):
-        from pandas.io.formats import format as fmt
-        formatter = fmt.CategoricalFormatter(self, length=length,
-                                             na_rep=na_rep, footer=footer)
-        result = formatter.to_string()
-        return compat.text_type(result)
-
-    def __unicode__(self):
-        """ Unicode representation. """
-        _maxlen = 10
-        if len(self._codes) > _maxlen:
-            result = self._tidy_repr(_maxlen)
-        elif len(self._codes) > 0:
-            result = self._get_repr(length=len(self) > _maxlen)
-        else:
-            msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
-            result = ('[], {repr_msg}'.format(repr_msg=msg))
-
-        return result
-
-    def _maybe_coerce_indexer(self, indexer):
-        """ return an indexer coerced to the codes dtype """
-        if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i':
-            indexer = indexer.astype(self._codes.dtype)
-        return indexer
-
-    def __getitem__(self, key):
-        """ Return an item. """
-        if isinstance(key, (int, np.integer)):
-            i = self._codes[key]
-            if i == -1:
-                return np.nan
-            else:
-                return self.categories[i]
-        else:
-            return self._constructor(values=self._codes[key],
-                                     categories=self.categories,
-                                     ordered=self.ordered, fastpath=True)
-
-    def __setitem__(self, key, value):
-        """ Item assignment.
-
-
-        Raises
-        ------
-        ValueError
-            If (one or more) Value is not in categories or if a assigned
-            `Categorical` does not have the same categories
-        """
-
-        # require identical categories set
-        if isinstance(value, Categorical):
-            if not value.categories.equals(self.categories):
-                raise ValueError("Cannot set a Categorical with another, "
-                                 "without identical categories")
-
-        rvalue = value if is_list_like(value) else [value]
-
-        from pandas import Index
-        to_add = Index(rvalue).difference(self.categories)
-
-        # no assignments of values not in categories, but it's always ok to set
-        # something to np.nan
-        if len(to_add) and not isna(to_add).all():
-            raise ValueError("Cannot setitem on a Categorical with a new "
-                             "category, set the categories first")
-
-        # set by position
-        if isinstance(key, (int, np.integer)):
-            pass
-
-        # tuple of indexers (dataframe)
-        elif isinstance(key, tuple):
-            # only allow 1 dimensional slicing, but can
-            # in a 2-d case be passd (slice(None),....)
-            if len(key) == 2:
-                if not is_null_slice(key[0]):
-                    raise AssertionError("invalid slicing for a 1-ndim "
-                                         "categorical")
-                key = key[1]
-            elif len(key) == 1:
-                key = key[0]
-            else:
-                raise AssertionError("invalid slicing for a 1-ndim "
-                                     "categorical")
-
-        # slicing in Series or Categorical
-        elif isinstance(key, slice):
-            pass
-
-        # Array of True/False in Series or Categorical
-        else:
-            # There is a bug in numpy, which does not accept a Series as a
-            # indexer
-            # https://github.com/pandas-dev/pandas/issues/6168
-            # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9
-            # FIXME: remove when numpy 1.9 is the lowest numpy version pandas
-            # accepts...
-            key = np.asarray(key)
-
-        lindexer = self.categories.get_indexer(rvalue)
-
-        # FIXME: the following can be removed after GH7820 is fixed:
-        # https://github.com/pandas-dev/pandas/issues/7820
-        # float categories do currently return -1 for np.nan, even if np.nan is
-        # included in the index -> "repair" this here
-        if isna(rvalue).any() and isna(self.categories).any():
-            nan_pos = np.where(isna(self.categories))[0]
-            lindexer[lindexer == -1] = nan_pos
-
-        lindexer = self._maybe_coerce_indexer(lindexer)
-        self._codes[key] = lindexer
-
-    def _reverse_indexer(self):
-        """
-        Compute the inverse of a categorical, returning
-        a dict of categories -> indexers.
-
-        *This is an internal function*
-
-        Returns
-        -------
-        dict of categories -> indexers
-
-        Example
-        -------
-        In [1]: c = pd.Categorical(list('aabca'))
-
-        In [2]: c
-        Out[2]:
-        [a, a, b, c, a]
-        Categories (3, object): [a, b, c]
-
-        In [3]: c.categories
-        Out[3]: Index([u'a', u'b', u'c'], dtype='object')
-
-        In [4]: c.codes
-        Out[4]: array([0, 0, 1, 2, 0], dtype=int8)
-
-        In [5]: c._reverse_indexer()
-        Out[5]: {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
-
-        """
-        categories = self.categories
-        r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'),
-                                               categories.size)
-        counts = counts.cumsum()
-        result = [r[counts[indexer]:counts[indexer + 1]]
-                  for indexer in range(len(counts) - 1)]
-        result = dict(zip(categories, result))
-        return result
-
-    # reduction ops #
-    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
-                filter_type=None, **kwds):
-        """ perform the reduction type operation """
-        func = getattr(self, name, None)
-        if func is None:
-            msg = 'Categorical cannot perform the operation {op}'
-            raise TypeError(msg.format(op=name))
-        return func(numeric_only=numeric_only, **kwds)
-
-    def min(self, numeric_only=None, **kwargs):
-        """ The minimum value of the object.
-
-        Only ordered `Categoricals` have a minimum!
-
-        Raises
-        ------
-        TypeError
-            If the `Categorical` is not `ordered`.
-
-        Returns
-        -------
-        min : the minimum of this `Categorical`
-        """
-        self.check_for_ordered('min')
-        if numeric_only:
-            good = self._codes != -1
-            pointer = self._codes[good].min(**kwargs)
-        else:
-            pointer = self._codes.min(**kwargs)
-        if pointer == -1:
-            return np.nan
-        else:
-            return self.categories[pointer]
-
-    def max(self, numeric_only=None, **kwargs):
-        """ The maximum value of the object.
-
-        Only ordered `Categoricals` have a maximum!
-
-        Raises
-        ------
-        TypeError
-            If the `Categorical` is not `ordered`.
-
-        Returns
-        -------
-        max : the maximum of this `Categorical`
-        """
-        self.check_for_ordered('max')
-        if numeric_only:
-            good = self._codes != -1
-            pointer = self._codes[good].max(**kwargs)
-        else:
-            pointer = self._codes.max(**kwargs)
-        if pointer == -1:
-            return np.nan
-        else:
-            return self.categories[pointer]
-
-    def mode(self):
-        """
-        Returns the mode(s) of the Categorical.
-
-        Always returns `Categorical` even if only one value.
-
-        Returns
-        -------
-        modes : `Categorical` (sorted)
-        """
-
-        import pandas._libs.hashtable as htable
-        good = self._codes != -1
-        values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
-        result = self._constructor(values=values, categories=self.categories,
-                                   ordered=self.ordered, fastpath=True)
-        return result
-
-    def unique(self):
-        """
-        Return the ``Categorical`` which ``categories`` and ``codes`` are
-        unique. Unused categories are NOT returned.
-
-        - unordered category: values and categories are sorted by appearance
-          order.
-        - ordered category: values are sorted by appearance order, categories
-          keeps existing order.
-
-        Returns
-        -------
-        unique values : ``Categorical``
-
-        Examples
-        --------
-        An unordered Categorical will return categories in the
-        order of appearance.
-
-        >>> pd.Categorical(list('baabc'))
-        [b, a, c]
-        Categories (3, object): [b, a, c]
-
-        >>> pd.Categorical(list('baabc'), categories=list('abc'))
-        [b, a, c]
-        Categories (3, object): [b, a, c]
-
-        An ordered Categorical preserves the category ordering.
-
-        >>> pd.Categorical(list('baabc'),
-        ...                categories=list('abc'),
-        ...                ordered=True)
-        [b, a, c]
-        Categories (3, object): [a < b < c]
-
-        See Also
-        --------
-        unique
-        CategoricalIndex.unique
-        Series.unique
-
-        """
-
-        # unlike np.unique, unique1d does not sort
-        unique_codes = unique1d(self.codes)
-        cat = self.copy()
-
-        # keep nan in codes
-        cat._codes = unique_codes
-
-        # exclude nan from indexer for categories
-        take_codes = unique_codes[unique_codes != -1]
-        if self.ordered:
-            take_codes = sorted(take_codes)
-        return cat.set_categories(cat.categories.take(take_codes))
-
-    def equals(self, other):
-        """
-        Returns True if categorical arrays are equal.
-
-        Parameters
-        ----------
-        other : `Categorical`
-
-        Returns
-        -------
-        are_equal : boolean
-        """
-        if self.is_dtype_equal(other):
-            if self.categories.equals(other.categories):
-                # fastpath to avoid re-coding
-                other_codes = other._codes
-            else:
-                other_codes = _recode_for_categories(other.codes,
-                                                     other.categories,
-                                                     self.categories)
-            return np.array_equal(self._codes, other_codes)
-        return False
-
-    def is_dtype_equal(self, other):
-        """
-        Returns True if categoricals are the same dtype
-        same categories, and same ordered
-
-        Parameters
-        ----------
-        other : Categorical
-
-        Returns
-        -------
-        are_equal : boolean
-        """
-
-        try:
-            return hash(self.dtype) == hash(other.dtype)
-        except (AttributeError, TypeError):
-            return False
-
-    def describe(self):
-        """ Describes this Categorical
-
-        Returns
-        -------
-        description: `DataFrame`
-            A dataframe with frequency and counts by category.
-        """
-        counts = self.value_counts(dropna=False)
-        freqs = counts / float(counts.sum())
-
-        from pandas.core.reshape.concat import concat
-        result = concat([counts, freqs], axis=1)
-        result.columns = ['counts', 'freqs']
-        result.index.name = 'categories'
-
-        return result
-
-    def repeat(self, repeats, *args, **kwargs):
-        """
-        Repeat elements of a Categorical.
-
-        See also
-        --------
-        numpy.ndarray.repeat
-
-        """
-        nv.validate_repeat(args, kwargs)
-        codes = self._codes.repeat(repeats)
-        return self._constructor(values=codes, categories=self.categories,
-                                 ordered=self.ordered, fastpath=True)
-
-# The Series.cat accessor
-
-
-class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
-    """
-    Accessor object for categorical properties of the Series values.
-
-    Be aware that assigning to `categories` is a inplace operation, while all
-    methods return new categorical data per default (but can be called with
-    `inplace=True`).
-
-    Parameters
-    ----------
-    data : Series or CategoricalIndex
-
-    Examples
-    --------
-    >>> s.cat.categories
-    >>> s.cat.categories = list('abc')
-    >>> s.cat.rename_categories(list('cab'))
-    >>> s.cat.reorder_categories(list('cab'))
-    >>> s.cat.add_categories(['d','e'])
-    >>> s.cat.remove_categories(['d'])
-    >>> s.cat.remove_unused_categories()
-    >>> s.cat.set_categories(list('abcde'))
-    >>> s.cat.as_ordered()
-    >>> s.cat.as_unordered()
-
-    """
-
-    def __init__(self, data):
-        self._validate(data)
-        self.categorical = data.values
-        self.index = data.index
-        self.name = data.name
-        self._freeze()
-
-    @staticmethod
-    def _validate(data):
-        if not is_categorical_dtype(data.dtype):
-            raise AttributeError("Can only use .cat accessor with a "
-                                 "'category' dtype")
-
-    def _delegate_property_get(self, name):
-        return getattr(self.categorical, name)
-
-    def _delegate_property_set(self, name, new_values):
-        return setattr(self.categorical, name, new_values)
-
-    @property
-    def codes(self):
-        from pandas import Series
-        return Series(self.categorical.codes, index=self.index)
-
-    def _delegate_method(self, name, *args, **kwargs):
-        from pandas import Series
-        method = getattr(self.categorical, name)
-        res = method(*args, **kwargs)
-        if res is not None:
-            return Series(res, index=self.index, name=self.name)
-
-
-CategoricalAccessor._add_delegate_accessors(delegate=Categorical,
-                                            accessors=["categories",
-                                                       "ordered"],
-                                            typ='property')
-CategoricalAccessor._add_delegate_accessors(delegate=Categorical, accessors=[
-    "rename_categories", "reorder_categories", "add_categories",
-    "remove_categories", "remove_unused_categories", "set_categories",
-    "as_ordered", "as_unordered"], typ='method')
-
-# utility routines
-
-
-def _get_codes_for_values(values, categories):
-    """
-    utility routine to turn values into codes given the specified categories
-    """
-
-    from pandas.core.algorithms import _get_data_algo, _hashtables
-    if not is_dtype_equal(values.dtype, categories.dtype):
-        values = _ensure_object(values)
-        categories = _ensure_object(categories)
-
-    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
-    (_, _), cats = _get_data_algo(categories, _hashtables)
-    t = hash_klass(len(cats))
-    t.map_locations(cats)
-    return coerce_indexer_dtype(t.lookup(vals), cats)
-
-
-def _recode_for_categories(codes, old_categories, new_categories):
-    """
-    Convert a set of codes for to a new set of categories
-
-    Parameters
-    ----------
-    codes : array
-    old_categories, new_categories : Index
-
-    Returns
-    -------
-    new_codes : array
-
-    Examples
-    --------
-    >>> old_cat = pd.Index(['b', 'a', 'c'])
-    >>> new_cat = pd.Index(['a', 'b'])
-    >>> codes = np.array([0, 1, 1, 2])
-    >>> _recode_for_categories(codes, old_cat, new_cat)
-    array([ 1,  0,  0, -1])
-    """
-    from pandas.core.algorithms import take_1d
-
-    if len(old_categories) == 0:
-        # All null anyway, so just retain the nulls
-        return codes.copy()
-    indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
-                                   new_categories)
-    new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
-    return new_codes
-
-
-def _convert_to_list_like(list_like):
-    if hasattr(list_like, "dtype"):
-        return list_like
-    if isinstance(list_like, list):
-        return list_like
-    if (is_sequence(list_like) or isinstance(list_like, tuple) or
-            isinstance(list_like, types.GeneratorType)):
-        return list(list_like)
-    elif is_scalar(list_like):
-        return [list_like]
-    else:
-        # is this reached?
-        return [list_like]
-
-
-def _factorize_from_iterable(values):
-    """
-    Factorize an input `values` into `categories` and `codes`. Preserves
-    categorical dtype in `categories`.
-
-    *This is an internal function*
-
-    Parameters
-    ----------
-    values : list-like
-
-    Returns
-    -------
-    codes : ndarray
-    categories : Index
-        If `values` has a categorical dtype, then `categories` is
-        a CategoricalIndex keeping the categories and order of `values`.
-    """
-    from pandas.core.indexes.category import CategoricalIndex
-
-    if not is_list_like(values):
-        raise TypeError("Input must be list-like")
-
-    if is_categorical(values):
-        if isinstance(values, (ABCCategoricalIndex, ABCSeries)):
-            values = values._values
-        categories = CategoricalIndex(values.categories,
-                                      categories=values.categories,
-                                      ordered=values.ordered)
-        codes = values.codes
-    else:
-        cat = Categorical(values, ordered=True)
-        categories = cat.categories
-        codes = cat.codes
-    return codes, categories
-
-
-def _factorize_from_iterables(iterables):
-    """
-    A higher-level wrapper over `_factorize_from_iterable`.
-
-    *This is an internal function*
-
-    Parameters
-    ----------
-    iterables : list-like of list-likes
-
-    Returns
-    -------
-    codes_list : list of ndarrays
-    categories_list : list of Indexes
-
-    Notes
-    -----
-    See `_factorize_from_iterable` for more info.
-    """
-    if len(iterables) == 0:
-        # For consistency, it should return a list of 2 lists.
-        return [[], []]
-    return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
+from pandas.core.arrays import Categorical  # noqa
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 5e6193d673756b..3e54ce61cd5b2c 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -314,7 +314,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False):
     Categories (3, object): [b, c, a]
     """
     from pandas import Index, Categorical, CategoricalIndex, Series
-    from pandas.core.categorical import _recode_for_categories
+    from pandas.core.arrays.categorical import _recode_for_categories

     if len(to_union) == 0:
         raise ValueError('No Categoricals to union')
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2c05eefa5706e6..7771060ad82c78 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -77,7 +77,7 @@
                                    create_block_manager_from_arrays,
                                    create_block_manager_from_blocks)
 from pandas.core.series import Series
-from pandas.core.categorical import Categorical
+from pandas.core.arrays import Categorical
 import pandas.core.algorithms as algorithms
 from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
                            OrderedDict, raise_with_traceback)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 082b6e2a8b1a00..25e44589488eed 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -47,7 +47,7 @@
                            DataError, SpecificationError)
 from pandas.core.index import (Index, MultiIndex, CategoricalIndex,
                                _ensure_index)
-from pandas.core.categorical import Categorical
+from pandas.core.arrays import Categorical
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.internals import BlockManager, make_block
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index ac7cb30fa823dd..9a6210db1aacbe 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -125,7 +125,7 @@ def _create_from_codes(self, codes, categories=None, ordered=None,
         CategoricalIndex
         """
-        from pandas.core.categorical import Categorical
+        from pandas.core.arrays import Categorical
         if categories is None:
             categories = self.categories
         if ordered is None:
@@ -162,7 +162,7 @@ def _create_categorical(self, data, categories=None, ordered=None,
         if not isinstance(data, ABCCategorical):
             if ordered is None and dtype is None:
                 ordered = False
-            from pandas.core.categorical import Categorical
+            from pandas.core.arrays import Categorical
             data = Categorical(data, categories=categories, ordered=ordered,
                                dtype=dtype)
         else:
@@ -462,7 +462,7 @@ def where(self, cond, other=None):
             other = self._na_value
         values = np.where(cond, self.values, other)

-        from pandas.core.categorical import Categorical
+        from pandas.core.arrays import Categorical
         cat = Categorical(values, categories=self.categories,
                           ordered=self.ordered)
@@ -775,7 +775,7 @@ def _delegate_method(self, name, *args, **kwargs):
     def _add_accessors(cls):
         """ add in Categorical accessor methods """
-        from pandas.core.categorical import Categorical
+        from pandas.core.arrays import Categorical
         CategoricalIndex._add_delegate_accessors(
             delegate=Categorical, accessors=["rename_categories",
                                              "reorder_categories",
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 5739c8dfd8b53e..608553b9c3bf20 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1182,7 +1182,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
             if len(arrays[i]) != len(arrays[i - 1]):
                 raise ValueError('all arrays must be same length')

-        from pandas.core.categorical import _factorize_from_iterables
+        from pandas.core.arrays.categorical import _factorize_from_iterables

         labels, levels = _factorize_from_iterables(arrays)
         if names is None:
@@ -1276,7 +1276,7 @@ def from_product(cls, iterables, sortorder=None, names=None):
         MultiIndex.from_arrays : Convert list of arrays to MultiIndex
         MultiIndex.from_tuples : Convert list of tuples to MultiIndex
         """
-        from pandas.core.categorical import _factorize_from_iterables
+        from pandas.core.arrays.categorical import _factorize_from_iterables
         from pandas.core.reshape.util import cartesian_product

         if not is_list_like(iterables):
@@ -1749,7 +1749,7 @@ def _get_labels_for_sorting(self):
        for sorting, where we need to disambiguate that -1 is not
        a valid valid
        """
-        from pandas.core.categorical import Categorical
+        from pandas.core.arrays import Categorical

        def cats(label):
            return np.arange(np.array(label).max() + 1 if len(label) else 0,
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 3c923133477df1..45618282ab4f74 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -59,7 +59,7 @@
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import maybe_convert_indices, length_of_indexer
-from pandas.core.categorical import Categorical, _maybe_to_categorical
+from pandas.core.arrays.categorical import Categorical, _maybe_to_categorical
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.io.formats.printing import pprint_thing
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index aaadf6d3ca32fa..20f4384a3d6984 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -7,8 +7,8 @@
 from pandas.core.index import (_get_objs_combined_axis, _ensure_index,
                                _get_consensus_names, _all_indexes_same)
-from pandas.core.categorical import (_factorize_from_iterable,
-                                     _factorize_from_iterables)
+from pandas.core.arrays.categorical import (_factorize_from_iterable,
+                                            _factorize_from_iterables)
 from pandas.core.internals import concatenate_block_managers
 from pandas.core import common as com
 from pandas.core.generic import NDFrame
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index 28e96946819126..01445eb30a9e57 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -4,7 +4,7 @@
 from pandas.core.dtypes.common import is_list_like
 from pandas import compat
-from pandas.core.categorical import Categorical
+from pandas.core.arrays import Categorical

 from pandas.core.dtypes.generic import ABCMultiIndex
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index f7a0fab9998d07..c8bca476c65f25 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -21,7 +21,8 @@
 from pandas.core.sparse.array import SparseArray
 from pandas._libs.sparse import IntIndex

-from pandas.core.categorical import Categorical, _factorize_from_iterable
+from pandas.core.arrays import Categorical
+from pandas.core.arrays.categorical import _factorize_from_iterable
 from pandas.core.sorting import (get_group_index, get_compressed_ids,
                                  compress_group_index, decons_obs_group_ids)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 73a7fe1fd89e9d..be40f65186d2df 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -53,7 +53,7 @@
 from pandas.core.indexing import check_bool_indexer, maybe_convert_indices
 from pandas.core import generic, base
 from pandas.core.internals import SingleBlockManager
-from pandas.core.categorical import Categorical, CategoricalAccessor
+from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
 from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.indexes.timedeltas import TimedeltaIndex
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 27252b9616a445..e550976d1deebd 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -182,7 +182,7 @@ def indexer_from_factorized(labels, shape, compress=True):


 def lexsort_indexer(keys, orders=None, na_position='last'):
-    from pandas.core.categorical import Categorical
+    from pandas.core.arrays import Categorical

     labels = []
     shape = []
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 150fccde81a60e..1a2f62442a0634 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -28,7 +28,7 @@
                                 _ensure_index_from_sequences)
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
-from pandas.core.categorical import Categorical
+from pandas.core.arrays import Categorical
 from pandas.core import algorithms
 from pandas.core.common import AbstractMethodError
 from pandas.io.date_converters import generic_parser
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 72543bb6f825ec..c8490167022e5a 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -36,7 +36,8 @@
 from pandas.errors import PerformanceWarning
 from pandas.core.common import _asarray_tuplesafe, _all_none
 from pandas.core.algorithms import match, unique
-from pandas.core.categorical import Categorical, _factorize_from_iterables
+from pandas.core.arrays.categorical import (Categorical,
+                                            _factorize_from_iterables)
 from pandas.core.internals import (BlockManager, make_block,
                                    _block2d_to_blocknd, _factor_indexer,
                                    _block_shape)
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2b97b447921bbc..b409cf20e9a098 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -24,7 +24,7 @@
 from pandas.compat import (lrange, lmap, lzip, text_type, string_types,
                            range, zip, BytesIO)
 from pandas.core.base import StringMixin
-from pandas.core.categorical import Categorical
+from pandas.core.arrays import Categorical
 from pandas.core.dtypes.common import (is_categorical_dtype, _ensure_object,
                                        is_datetime64_dtype)
 from pandas.core.frame import DataFrame
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 821c7858c7a5c4..e84f09ead4f77d 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -249,3 +249,10 @@ def test_deprecation_cdaterange(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             cdate_range('2017-01-01', '2017-12-31')
+
+
+class TestCategoricalMove(object):
+
+    def test_categorical_move(self):
+        with tm.assert_produces_warning(FutureWarning):
+            from pandas.core.categorical import Categorical  # noqa
diff --git a/pandas/tests/categorical/test_api.py b/pandas/tests/categorical/test_api.py
index 0af2857091b747..ad5b78b36438b4 100644
--- a/pandas/tests/categorical/test_api.py
+++ b/pandas/tests/categorical/test_api.py
@@ -7,7 +7,7 @@
 import pandas.util.testing as tm
 from pandas import Categorical, CategoricalIndex, Index, Series, DataFrame
-from pandas.core.categorical import _recode_for_categories
+from pandas.core.arrays.categorical import _recode_for_categories
 from pandas.tests.categorical.common import TestCategorical
diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py
index 73cc87855acbdf..cf8698bc5ed5e1 100644
--- a/pandas/tests/series/test_api.py
+++ b/pandas/tests/series/test_api.py
@@ -511,8 +511,7 @@ def test_cat_accessor(self):

     def test_cat_accessor_api(self):
         # GH 9322
-        from pandas.core.categorical import CategoricalAccessor
-
+        from pandas.core.arrays.categorical import CategoricalAccessor
         assert Series.cat is CategoricalAccessor
         s = Series(list('aabbcde')).astype('category')
         assert isinstance(s.cat, CategoricalAccessor)
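
Usage sketch (illustrative, not part of the patch): after this move, downstream code should import Categorical from the new pandas.core.arrays namespace; pandas.core.categorical is kept only as a shim module, and the new TestCategoricalMove test above expects importing from the old path to raise a FutureWarning. A minimal example, assuming a pandas build with this patch applied:

    # New canonical location for the array type introduced by this patch
    from pandas.core.arrays import Categorical

    cat = Categorical(['a', 'b', 'a'], categories=['a', 'b'], ordered=True)
    print(cat.codes)        # array([0, 1, 0], dtype=int8)
    print(cat.categories)   # Index(['a', 'b'], dtype='object')

    # Old location, kept only for backwards compatibility via the shim left
    # in pandas/core/categorical.py (and the pickle-compat mapping)
    from pandas.core.categorical import Categorical  # noqa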