ENH: Allow storing ExtensionArrays in containers #19520

Merged: 139 commits, Feb 23, 2018.
Changes shown from 12 commits.

Commits (139):
b7069ef
ENH: non-interval changes
TomAugspurger Feb 2, 2018
9cd92c7
COMPAT: py2 Super
TomAugspurger Feb 3, 2018
9211bbd
BUG: Use original object for extension array
TomAugspurger Feb 3, 2018
80f83a6
Consistent boxing / unboxing
TomAugspurger Feb 3, 2018
ca004d8
32-bit compat
TomAugspurger Feb 3, 2018
5d4a686
Add a test array
TomAugspurger Feb 5, 2018
9f4ad42
linting
TomAugspurger Feb 5, 2018
b1db4e8
Default __iter__
TomAugspurger Feb 5, 2018
00d6bb3
Tests for value_counts
TomAugspurger Feb 5, 2018
1608e3d
Implement value_counts
TomAugspurger Feb 5, 2018
52e2180
Py2 compat
TomAugspurger Feb 5, 2018
e6d06e2
Fixed dropna
TomAugspurger Feb 5, 2018
d356f19
Test fixups
TomAugspurger Feb 6, 2018
a6ae340
Started setitem
TomAugspurger Feb 6, 2018
41f09d8
REF/Clean: Internal / External values
TomAugspurger Feb 3, 2018
29cfd7c
Move to index base
TomAugspurger Feb 6, 2018
05eb9f3
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 6, 2018
82fd0c6
Setitem tests, decimal example
TomAugspurger Feb 7, 2018
8b1e7d6
Compat
TomAugspurger Feb 7, 2018
10af4b6
Fixed extension block tests.
TomAugspurger Feb 7, 2018
cd5f1eb
Clarify binop tests
TomAugspurger Feb 7, 2018
0a9d9fd
TST: Removed ops tests
TomAugspurger Feb 7, 2018
3185f4e
Cleanup unique handling
TomAugspurger Feb 7, 2018
ffa888a
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 7, 2018
5a59591
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 7, 2018
476f75d
Simplify object concat
TomAugspurger Feb 7, 2018
b15ee5a
Use values for intersection
TomAugspurger Feb 7, 2018
659073f
hmm
TomAugspurger Feb 7, 2018
b15ecac
More failing tests
TomAugspurger Feb 7, 2018
88b8f4f
remove bad test
TomAugspurger Feb 7, 2018
349ac1a
better setitem
TomAugspurger Feb 8, 2018
27ab045
Dropna works.
TomAugspurger Feb 8, 2018
8358fb1
Restore xfail test
TomAugspurger Feb 8, 2018
8ef34a9
Test Categorical
TomAugspurger Feb 8, 2018
340d11b
Xfail setitem tests
TomAugspurger Feb 8, 2018
8297888
TST: Skip JSON tests on py2
TomAugspurger Feb 8, 2018
7accb67
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 8, 2018
9b8d2a5
Additional testing
TomAugspurger Feb 8, 2018
9fbac29
More tests
TomAugspurger Feb 8, 2018
55305dc
ndarray_values
TomAugspurger Feb 8, 2018
0e63708
API: Default ExtensionArray.astype
TomAugspurger Feb 8, 2018
fbbbc8a
Simplify concat_as_object
TomAugspurger Feb 8, 2018
46a0a49
Py2 compat
TomAugspurger Feb 8, 2018
2c4445a
Set-ops ugliness
TomAugspurger Feb 8, 2018
5612cda
better docstrings
TomAugspurger Feb 8, 2018
b012c19
tolist
TomAugspurger Feb 8, 2018
d49e6aa
linting
TomAugspurger Feb 8, 2018
d7d31ee
Moved dtypes
TomAugspurger Feb 9, 2018
7b89f1b
clean
TomAugspurger Feb 9, 2018
b0dbffd
cleanup
TomAugspurger Feb 9, 2018
66b936f
NumPy compat
TomAugspurger Feb 9, 2018
32ee0ef
Use base _values for CategoricalIndex
TomAugspurger Feb 9, 2018
a9882e2
Update dev docs
TomAugspurger Feb 9, 2018
f53652a
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 9, 2018
2425621
cleanup
TomAugspurger Feb 9, 2018
fc337de
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 9, 2018
6abe9da
cleanup
TomAugspurger Feb 9, 2018
512fb89
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 9, 2018
c5db5da
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 9, 2018
0b112f2
cleanup
TomAugspurger Feb 9, 2018
170d0c7
Linting
TomAugspurger Feb 9, 2018
402620f
Precision in tests
TomAugspurger Feb 9, 2018
b556f83
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 9, 2018
268aabc
Linting
TomAugspurger Feb 9, 2018
d671259
Move to extension
TomAugspurger Feb 9, 2018
cb740ed
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 9, 2018
d9e8dd6
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 9, 2018
815d202
Push _ndarray_values to ExtensionArray
TomAugspurger Feb 11, 2018
a727b21
Clean up tolist
TomAugspurger Feb 11, 2018
f368c29
Move test locations
TomAugspurger Feb 11, 2018
d74c5c9
Fixed test
TomAugspurger Feb 12, 2018
0ec6958
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 12, 2018
8104ee5
REF: Update per comments
TomAugspurger Feb 12, 2018
4d08218
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 12, 2018
f8e29b9
lint
TomAugspurger Feb 12, 2018
0cd9faa
REF: Use _values for size and shape
TomAugspurger Feb 12, 2018
8fcdb70
PERF: Implement size, shape for IntervalIndex
TomAugspurger Feb 12, 2018
34a6a22
PERF: Avoid materializing values for PeriodIndex shape, size
TomAugspurger Feb 12, 2018
b4de5c4
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 12, 2018
c233c28
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 13, 2018
d6e8051
Cleanup
TomAugspurger Feb 13, 2018
3af8a21
Override nbytes
TomAugspurger Feb 13, 2018
3a5118b
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 13, 2018
1e8e87e
Remove unused change
TomAugspurger Feb 13, 2018
0f5e4f0
Docs
TomAugspurger Feb 13, 2018
1436d1d
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 13, 2018
c4dab88
Test cleanpu
TomAugspurger Feb 13, 2018
a312ba5
Always set PANDAS_TESTING_MODE
TomAugspurger Feb 13, 2018
758689f
Revert "Always set PANDAS_TESTING_MODE"
TomAugspurger Feb 13, 2018
02c3d40
Explicitly catch warnings or not
TomAugspurger Feb 13, 2018
9e17037
fastparquet warnings
TomAugspurger Feb 13, 2018
4599db4
Unicode literals strikes again.
TomAugspurger Feb 14, 2018
d34d9ca
Restore circle env var
TomAugspurger Feb 14, 2018
29d2528
More parquet test catching
TomAugspurger Feb 14, 2018
905d72e
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 14, 2018
412c951
No stacklevel
TomAugspurger Feb 14, 2018
78834f1
Lower bound on FP
TomAugspurger Feb 15, 2018
f8eac55
Exact bound for FP
TomAugspurger Feb 15, 2018
f09c863
Don't use fastpath for ExtensionBlock make_block
TomAugspurger Feb 15, 2018
cedb63d
Consistently use _values
TomAugspurger Feb 16, 2018
cae2c26
TST: Additional constructor tests
TomAugspurger Feb 16, 2018
8088096
CLN: de-nested a bit
TomAugspurger Feb 16, 2018
8aed325
_fill_value handling
TomAugspurger Feb 16, 2018
453728a
Handle user provided dtype in constructors.
TomAugspurger Feb 16, 2018
cc13c8d
Document ExtensionBlock._maybe_coerce_values
TomAugspurger Feb 16, 2018
f90ac07
Created ABCExtensionArray
TomAugspurger Feb 16, 2018
4a03b26
TST: Tests for is_object_dtype and is_string_dtype and EAs
TomAugspurger Feb 16, 2018
635223f
fixup! Handle user provided dtype in constructors.
TomAugspurger Feb 17, 2018
cf423a7
Doc for setitem
TomAugspurger Feb 17, 2018
2d1a66c
Split base tests
TomAugspurger Feb 17, 2018
c849865
Revert test_parquet changes
TomAugspurger Feb 17, 2018
c3ec822
API: Removed _fill_value from the interface
TomAugspurger Feb 17, 2018
f4cf45c
Push coercion to extension dtype till later
TomAugspurger Feb 17, 2018
9c5d479
Linting
TomAugspurger Feb 17, 2018
1175c0d
ERR: Better error message for coercion to 3rd party dtypes
TomAugspurger Feb 17, 2018
c816d99
CLN: Make take_nd EA aware
TomAugspurger Feb 17, 2018
9c9f59e
Revert sparse changes
TomAugspurger Feb 17, 2018
08af9a3
Other _typ for ABCExtensionArray
TomAugspurger Feb 17, 2018
2e992f7
Test cleanup and expansion.
TomAugspurger Feb 17, 2018
cc5cc3e
Copy if copy
TomAugspurger Feb 17, 2018
704ee67
TST: remove self param for fixture
TomAugspurger Feb 18, 2018
c262968
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 19, 2018
8bf0334
Remove unnescessary EA handling in Series ctor
TomAugspurger Feb 20, 2018
c8d88da
API: Removed value_counts
TomAugspurger Feb 20, 2018
24f3b60
More doc notes
TomAugspurger Feb 20, 2018
50bd5dd
Handle expanding a DataFrame with an EA
TomAugspurger Feb 20, 2018
879bc84
Added ExtensionDtype.__eq__
TomAugspurger Feb 20, 2018
33c9d1f
linting
TomAugspurger Feb 21, 2018
f07c166
REF: is_dtype_equal refactor
TomAugspurger Feb 21, 2018
79d43b1
Remove reference to dtype being a class
TomAugspurger Feb 21, 2018
91c629b
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 21, 2018
a1ebf53
move
TomAugspurger Feb 22, 2018
aa57cad
Moved sparse check to take_nd
TomAugspurger Feb 22, 2018
c82748c
Docstring
TomAugspurger Feb 22, 2018
e919343
Split tests
TomAugspurger Feb 22, 2018
1ea74da
Revert index change
TomAugspurger Feb 22, 2018
0c41a34
Copy changes
TomAugspurger Feb 22, 2018
009bece
Simplify EA implementation names
TomAugspurger Feb 22, 2018
ea5562b
Linting
TomAugspurger Feb 22, 2018
pandas/core/algorithms.py (2 additions & 1 deletion)
@@ -15,6 +15,7 @@
is_unsigned_integer_dtype, is_signed_integer_dtype,
is_integer_dtype, is_complex_dtype,
is_object_dtype,
is_extension_array_dtype,
is_categorical_dtype, is_sparse,
is_period_dtype,
is_numeric_dtype, is_float_dtype,
@@ -542,7 +543,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,

else:

if is_categorical_dtype(values) or is_sparse(values):
if is_extension_array_dtype(values) or is_sparse(values):
Comment (Contributor):

is_sparse should be recognized as an extension array type

Comment (Contributor Author):

It's on my todo list. I'll do it after interval.


# handle Categorical and sparse,
result = Series(values).values.value_counts(dropna=dropna)
Comment (Member):

Shouldn't this be Series(values)._values ? Because I would expect this to fail for anything that is not a categorical (as a category series returns a categorical, but for now others return a ndarray there?)

Comment (Contributor Author):

This should be ._values, but

but for now others return a ndarray there?

What types do you mean by others? Internally, our only type returning true here is Categorical (or sparse, via is_sparse).

Comment (Member):

Yeah, forgot again for a moment that we don't yet have those other internal extension arrays apart from Categorical :)

Comment (Contributor Author @TomAugspurger, Feb 16, 2018):

I also wonder why we're doing Series(values).values here. I suspect to handle any of categorical / series / index inputs in a consistent way.

Probably better to make an _unbox_pandas method that extracts the underlying array from an array (no-op) or Series/Index (._values).

pandas/core/arrays/base.py (27 additions & 9 deletions)
@@ -1,4 +1,6 @@
"""An interface for extending pandas with custom arrays."""
import numpy as np

from pandas.errors import AbstractMethodError

_not_implemented_message = "{} does not implement {}."
@@ -23,14 +25,14 @@ class ExtensionArray(object):
* isna
* take
* copy
* _formatting_values
* _concat_same_type

Some additional methods are required to satisfy pandas' internal, private
block API.

* _concat_same_type
* _can_hold_na
* _formatting_values
* _fill_value

This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
@@ -51,9 +53,6 @@ class ExtensionArray(object):
Extension arrays should be able to be constructed with instances of
the class, i.e. ``ExtensionArray(extension_array)`` should return
an instance, not error.

Additionally, certain methods and interfaces are required for proper
this array to be properly stored inside a ``DataFrame`` or ``Series``.
"""
# ------------------------------------------------------------------------
# Must be a Sequence
@@ -105,6 +104,16 @@ def __len__(self):
# type: () -> int
raise AbstractMethodError(self)

def __iter__(self):
"""Iterate over elements.

This needs to be implemented so that pandas recognizes extension arrays
Comment (Contributor):

can be a better doc-string here

as list-like. The default implementation makes successive calls to
``__getitem__``, which may be slower than necessary.
"""
for i in range(len(self)):
Comment (Contributor):

why are you not just raising AbstractMethodError?

Comment (Contributor):

this is tied intimately with the boxing concept of scalars, but this should be decoupled from getitem (yes its a valid implementation, but you would never actually do it this way), rather you would iterate on some underlying value and box it, not index into and as a side effect use getitem to box.

Comment (Contributor Author):

rather you would iterate on some underlying value

We don't know how the subclass is storing things though, so there's no .values to iterate over.

Comment (Contributor):

so my point is that you cannot implement a default here, make the sub-classes do it. yes it might repeat some code, but its much better there i think.

Comment (Member):

you cannot

Yes, we can (see the code). You "don't want" it; that's something different :)
But I don't really understand why the above is not a good default implementation. I think a lot of subclasses will do exactly this.

Comment (Contributor Author):

And some context: If I remove __iter__, the object will still be iterable with essentially this implementation since it defines __getitem__ and __len__. We just need to implement it explicitly so that is_iterable works correctly.

Comment (Contributor):

I think it still might be better raise here by default and force the subclass to implement this (even if this impl). It is SO important for subclasses to pay attention to this, it should just be required.

Comment (Member):

You can't do much better than this implementation.
To give you some context: in geopandas, I have a cython method that converts a GeometryArray to numpy object array of Geometry objects, which takes ca 80ms on an array of 10000 points. Simply doing np.array([a[i] for i in range(len(a))], dtype=object) is ca 108 ms. This small difference is because by definition in each iteration I need to create a python object.

yield self[i]
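The exchange above can be grounded with a standalone sketch. `DecimalLikeArray` below is a hypothetical container (not pandas code) implementing the `__getitem__`-based default iteration under discussion; defining `__iter__` explicitly also makes iterability checks that look for the attribute pass:

```python
class DecimalLikeArray:
    """Hypothetical extension-array-like container backed by a list."""

    def __init__(self, data):
        self._data = list(data)

    def __len__(self):
        return len(self._data)

    def __getitem__(self, i):
        # Boxing happens here: each access returns one scalar object.
        return self._data[i]

    def __iter__(self):
        # The default implementation under discussion: successive
        # __getitem__ calls, one boxed scalar per iteration.
        for i in range(len(self)):
            yield self[i]


arr = DecimalLikeArray([1, 2, 3])
print(list(arr))  # [1, 2, 3]
```

As the thread notes, the object would be iterable even without `__iter__` (Python falls back to the sequence protocol), so the explicit method changes detectability more than behavior.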

# ------------------------------------------------------------------------
# Required attributes
# ------------------------------------------------------------------------
@@ -177,9 +186,9 @@ def take(self, indexer, allow_fill=True, fill_value=None):

Examples
--------
Suppose the extension array somehow backed by a NumPy structured array
and that the underlying structured array is stored as ``self.data``.
Then ``take`` may be written as
Suppose the extension array somehow backed by a NumPy array and that
the underlying structured array is stored as ``self.data``. Then
``take`` may be written as

.. code-block:: python

@@ -219,7 +228,7 @@ def _formatting_values(self):
# type: () -> np.ndarray
# At the moment, this has to be an array since we use result.dtype
"""An array of values to be printed in, e.g. the Series repr"""
raise AbstractMethodError(self)
return np.array(self)
Comment (Contributor):

NO, this should be implemented only by subclasses.

Comment (Contributor Author):

Why?

Comment (Member):

This is a very sensible default IMO, subclasses can always override if needed


@classmethod
def _concat_same_type(cls, to_concat):
@@ -236,6 +245,7 @@ def _concat_same_type(cls, to_concat):
"""
raise AbstractMethodError(cls)

@property
def _can_hold_na(self):
Comment (Contributor Author):

Interface change: This should have been a property.

# type: () -> bool
"""Whether your array can hold missing values. True by default.
@@ -245,3 +255,11 @@ def _can_hold_na(self):
Setting this to false will optimize some operations like fillna.
"""
return True

def value_counts(self, dropna=True):
from pandas import value_counts

if dropna:
self = self[~self.isna()]

return value_counts(np.array(self))
Comment (Contributor):

This is too tied to materializing using ndarary. rather this should raise AbstractMethodError

Comment (Contributor Author):

We may have a difference in philosophy here then. I'm trying to follow the lead of standard library in providing default implementations for as much as possible, even if they're sub-optimal.

Comment (Member):

What I find a bit "strange" about this method in adding it to the required interface of an extension array, is that it returns a Series

Comment (Contributor Author):

What's strange about that? The fact that it's being added, or that it's returning a Series? I'm not sure what else it would return.

Ideally, we'll someday have an Index[extension_dtype], so that you get a Series with an extension-array-backed index. That seems like the most natural return value.

Comment (Contributor):

you need to wrap this on the return, IOW call EA(....) here

Comment (Contributor):

hmm, not real happy with this interface, but I guess ok for now. let's revisit this at somepoint in the future.

Comment (Member):

@TomAugspurger I suppose you want this in here, so you can implement a more efficient method on IPArray?
Because we could also, for now, keep this implementation in pd.value_counts (so for people who want this functionality, they can use it) instead of adding it as a method on ExtensionArray interface itself. And we can revisit when talking about unique/factorize/duplicated/.. algos support for ExtensionArrays in general?

Comment (Contributor Author):

I mostly added it since that's what Categorical does. I can see if removing it breaks anything.

Comment (Contributor Author):

Just to make sure, we're OK with pd.value_counts and Series[EA] currently failing for 3rd party EAs? By removing that from the interface, we'll see

AttributeError: 'DecimalArray' object has no attribute 'value_counts'

I could make that error message better, but I'd prefer waiting on that till we flesh out what we want pd.value_counts(EA) to do.

Comment (Member):

But in principle you can easily add this fallback to do a value_counts on the array of objects there?
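The fallback being discussed (materialize to objects, optionally drop missing values, then count) can be sketched without pandas internals. `TinyArray` and its NaN-based `isna` are hypothetical stand-ins; real pandas would return a `Series` rather than a `Counter`:

```python
from collections import Counter
import math


class TinyArray:
    """Hypothetical extension array; NaN marks a missing value."""

    def __init__(self, data):
        self._data = list(data)

    def isna(self):
        return [isinstance(x, float) and math.isnan(x) for x in self._data]

    def value_counts(self, dropna=True):
        data = self._data
        if dropna:
            # Drop elements flagged by the array's own missing mask.
            data = [x for x, missing in zip(data, self.isna()) if not missing]
        # pandas would wrap this in a Series; a Counter shows the idea.
        return Counter(data)


counts = TinyArray([1, 1, 2, float("nan")]).value_counts()
print(counts)  # Counter({1: 2, 2: 1})
```

The objection in the thread is exactly this materialization step: a backend with a cheap native count would want to override it.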

pandas/core/dtypes/base.py (9 additions & 3 deletions)
@@ -1,4 +1,6 @@
"""Extend pandas with custom array types"""
import inspect

from pandas.errors import AbstractMethodError


@@ -106,7 +108,8 @@ def is_dtype(cls, dtype):

Parameters
----------
dtype : str or dtype
dtype : str, object, or type
The dtype to check.

Returns
-------
@@ -118,12 +121,15 @@ def is_dtype(cls, dtype):

1. ``cls.construct_from_string(dtype)`` is an instance
of ``cls``.
2. 'dtype' is ``cls`` or a subclass of ``cls``.
2. ``dtype`` is an object and is an instance of ``cls``
3. 'dtype' is a class and is ``cls`` or a subclass of ``cls``.
Comment (Contributor):

shouldn't this have double-ticks?

"""
if isinstance(dtype, str):
try:
return isinstance(cls.construct_from_string(dtype), cls)
except TypeError:
return False
else:
elif inspect.isclass(dtype):
Comment (Contributor):

you have 2 conditions the same here

Comment (Contributor Author):

issubclass vs. isinstance :) Want to be able to handle .is_dtype(ExtensionDtype) and .is_dtype(ExtensionDtype()).

return issubclass(dtype, cls)
else:
return isinstance(dtype, cls)
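The issubclass-versus-isinstance point from the exchange above, shown in isolation with a hypothetical `ToyDtype` mirroring the three-branch logic of the diff (string, class, instance):

```python
import inspect


class ToyDtype:
    """Hypothetical dtype with the same three-branch is_dtype logic."""

    name = "toy"

    @classmethod
    def construct_from_string(cls, string):
        if string != cls.name:
            raise TypeError("cannot construct a {} from {!r}"
                            .format(cls.__name__, string))
        return cls()

    @classmethod
    def is_dtype(cls, dtype):
        if isinstance(dtype, str):
            try:
                return isinstance(cls.construct_from_string(dtype), cls)
            except TypeError:
                return False
        elif inspect.isclass(dtype):
            # Handles .is_dtype(ToyDtype): class or subclass of cls.
            return issubclass(dtype, cls)
        else:
            # Handles .is_dtype(ToyDtype()): instance of cls.
            return isinstance(dtype, cls)


print(ToyDtype.is_dtype("toy"))       # True, via construct_from_string
print(ToyDtype.is_dtype(ToyDtype))    # True, class branch
print(ToyDtype.is_dtype(ToyDtype()))  # True, instance branch
print(ToyDtype.is_dtype("int64"))     # False
```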
pandas/core/dtypes/common.py (2 additions & 1 deletion)
@@ -1708,9 +1708,10 @@ def is_extension_array_dtype(arr_or_dtype):
"""
from pandas.core.arrays import ExtensionArray

# we want to unpack series, anything else?
if isinstance(arr_or_dtype, ABCSeries):
arr_or_dtype = arr_or_dtype._values
elif isinstance(arr_or_dtype, ABCIndexClass):
arr_or_dtype = arr_or_dtype._as_best_array()
return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))
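A minimal sketch of the unwrap-then-check pattern in this hunk, with hypothetical `FakeSeries`/`FakeExtensionArray` stand-ins (the real function also unwraps Index objects and accepts dtype instances):

```python
class FakeExtensionArray:
    """Hypothetical stand-in for an ExtensionArray subclass."""


class FakeSeries:
    """Hypothetical Series-like container holding a backing array."""

    def __init__(self, values):
        self._values = values


def is_extension_array_dtype_sketch(obj):
    # Unwrap the container to its backing array first, then check
    # the array itself.
    if isinstance(obj, FakeSeries):
        obj = obj._values
    return isinstance(obj, FakeExtensionArray)


print(is_extension_array_dtype_sketch(FakeSeries(FakeExtensionArray())))  # True
print(is_extension_array_dtype_sketch([1, 2, 3]))  # False
```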


pandas/core/dtypes/missing.py (21 additions & 11 deletions)
@@ -10,9 +10,10 @@
is_datetimelike_v_numeric, is_float_dtype,
is_datetime64_dtype, is_datetime64tz_dtype,
is_timedelta64_dtype, is_interval_dtype,
is_complex_dtype, is_categorical_dtype,
is_complex_dtype,
is_string_like_dtype, is_bool_dtype,
is_integer_dtype, is_dtype_equal,
is_extension_array_dtype,
needs_i8_conversion, _ensure_object,
pandas_dtype,
is_scalar,
@@ -52,12 +53,15 @@ def isna(obj):


def _isna_new(obj):
from ..arrays import ExtensionArray
Comment (Contributor):

pls don't do this. rather define an ABCExtensionArray. instead. This is the purpose of these, to avoid smell like this.


if is_scalar(obj):
return libmissing.checknull(obj)
# hack (for now) because MI registers as ndarray
elif isinstance(obj, ABCMultiIndex):
raise NotImplementedError("isna is not defined for MultiIndex")
elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
ExtensionArray)):
return _isna_ndarraylike(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isna(func=isna))
@@ -124,17 +128,20 @@ def _use_inf_as_na(key):


def _isna_ndarraylike(obj):

values = getattr(obj, 'values', obj)
dtype = values.dtype

if is_string_dtype(dtype):
if is_categorical_dtype(values):
from pandas import Categorical
if not isinstance(values, Categorical):
values = values.values
result = values.isna()
elif is_interval_dtype(values):
if is_extension_array_dtype(obj):
if isinstance(obj, ABCIndexClass):
values = obj._as_best_array()
elif isinstance(obj, ABCSeries):
values = obj._values
else:
values = obj
result = values.isna()
elif is_string_dtype(dtype):
Comment (Member):

I know it was already there, but I find the use of is_string_dtype rather confusing here, as intervals are not strings at all (but it just checks for string or object dtype ..).
But maybe can do

elif is_interval_dtype(values):
    ...
elif is string_dtype(values):
    ...

Comment (Contributor):

instead of this, pls fix is_string_dtype (to exclude interval as well as period dtypes (you might be able to say object and not extension_type).

Comment (Contributor Author):

This will fix itself once IntervalArray is a proper extension type. Added tests to confirm that extension types are not True for is_string_dtype and is_object_dtype.

if is_interval_dtype(values):
# TODO(IntervalArray): remove this if block
from pandas import IntervalIndex
result = IntervalIndex(obj).isna()
else:
@@ -406,4 +413,7 @@ def remove_na_arraylike(arr):
"""
Return array-like containing only true/non-NaN values, possibly empty.
"""
return arr[notna(lib.values_from_object(arr))]
if is_extension_array_dtype(arr):
Comment (Contributor):

ultimately values_from_object just calls .get_values() on the object if it exists. This smells like something is missing from the EA interface.

Comment (Contributor Author):

I don't think this if / elif can be avoided. values_from_object gets and calls arr.get_values if defined, which returns an ndarray. But we need to call notna on the EA itself, since it has special missing value handling.

return arr[notna(arr)]
else:
return arr[notna(lib.values_from_object(arr))]
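The EA branch above exists because only the array knows its own missing-value representation; an ndarray-based `notna` would miss a custom sentinel. A pure-Python sketch with a hypothetical `SentinelArray` that uses -1 as its sentinel:

```python
class SentinelArray:
    """Hypothetical extension array using -1 as its missing sentinel."""

    def __init__(self, data):
        self._data = list(data)

    def isna(self):
        # A generic ndarray check cannot know this convention; only
        # the array itself can compute the mask.
        return [x == -1 for x in self._data]

    def filter(self, keep):
        return SentinelArray([x for x, k in zip(self._data, keep) if k])

    def tolist(self):
        return list(self._data)


def remove_na_sketch(arr):
    # The EA branch of remove_na_arraylike: arr[notna(arr)], i.e. the
    # array filters itself by its own missing-value mask.
    return arr.filter([not m for m in arr.isna()])


print(remove_na_sketch(SentinelArray([0, -1, 2])).tolist())  # [0, 2]
```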
pandas/core/frame.py (9 additions & 11 deletions)
@@ -39,6 +39,7 @@
is_categorical_dtype,
is_object_dtype,
is_extension_type,
is_extension_array_dtype,
is_datetimetz,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
@@ -71,7 +72,7 @@
create_block_manager_from_arrays,
create_block_manager_from_blocks)
from pandas.core.series import Series
from pandas.core.arrays import Categorical
from pandas.core.arrays import Categorical, ExtensionArray
import pandas.core.algorithms as algorithms
from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
OrderedDict, raise_with_traceback)
@@ -511,7 +512,7 @@ def _get_axes(N, K, index=index, columns=columns):
index, columns = _get_axes(len(values), 1)
return _arrays_to_mgr([values], columns, index, columns,
dtype=dtype)
elif is_datetimetz(values):
elif (is_datetimetz(values) or is_extension_array_dtype(values)):
Comment (Member):

What happens if the dtype is an extension dtype? (like the check above for categorical)
But certainly fine with not supporting it for now.

Similar for Series, we need to either define what pd.Series(values, dtype=SomeExtensionDtype()) does, or either error on it.

Comment (Contributor Author):

In [7]: pd.Series([0, 1, 2], dtype=cyberpandas.IPType())
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-7-7fc01964da1d> in <module>()
----> 1 pd.Series([0, 1, 2], dtype=cyberpandas.IPType())

~/sandbox/pandas-ip/pandas/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
    246             else:
    247                 data = _sanitize_array(data, index, dtype, copy,
--> 248                                        raise_cast_failure=True)
    249
    250                 data = SingleBlockManager(data, index, fastpath=True)

~/sandbox/pandas-ip/pandas/pandas/core/series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
   3214         if dtype is not None:
   3215             try:
-> 3216                 subarr = _try_cast(data, False)
   3217             except Exception:
   3218                 if raise_cast_failure:  # pragma: no cover

~/sandbox/pandas-ip/pandas/pandas/core/series.py in _try_cast(arr, take_fast_path)
   3168             subarr = maybe_cast_to_datetime(arr, dtype)
   3169             if not is_extension_type(subarr):
-> 3170                 subarr = np.array(subarr, dtype=dtype, copy=copy)
   3171         except (ValueError, TypeError):
   3172             if is_categorical_dtype(dtype):

TypeError: data type not understood

How should we handle this? I'm fine with raising with a better error message. We don't know how to cast from (arbitrary array, extension_dtype) -> extension array.

Although... maybe we could support Series(extension_array, dtype=extension_dtype).

Oh hey, that "works"

In [10]: pd.Series(cyberpandas.IPArray([0, 1, 2]), dtype=cyberpandas.IPType())
Out[10]:
0    0.0.0.0
1    0.0.0.1
2    0.0.0.2
dtype: ip

But it only works since we ignore the dtype entirely :)

I will raise if data.dtype is not dtype equal with dtype I think.

Comment (Member):

also for things like astype(SomeExtensionDtype) we will need an informative error message
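One possible shape for such a message (purely illustrative; the function name and the caster-registry mechanism here are hypothetical, not pandas API):

```python
# Illustrative only: a friendlier error for astype(SomeExtensionDtype) when
# no casting path is known, instead of a bare "data type not understood".

def astype_with_message(values, dtype, casters=None):
    """Cast via a registered converter, or raise an informative TypeError."""
    casters = casters or {}
    key = type(dtype).__name__
    if key not in casters:
        raise TypeError(
            "Cannot cast object of type {!r} to extension dtype {!r}; "
            "no conversion is registered. Construct the extension array "
            "directly instead.".format(type(values).__name__, key))
    return casters[key](values)
```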

Contributor:

Isn't a datetimetz an extension type now? Why can't you remove the first part of the clause?

Contributor Author:

Nope, not yet. Just Categorical so far. I have Interval and Period started, but no PRs yet.

# GH19157
if columns is None:
columns = [0]
@@ -2796,15 +2797,15 @@ def reindexer(value):
# now align rows
value = reindexer(value).T

elif isinstance(value, Categorical):
elif isinstance(value, ExtensionArray):
value = value.copy()

elif isinstance(value, Index) or is_sequence(value):
from pandas.core.series import _sanitize_index

# turn me into an ndarray
value = _sanitize_index(value, self.index, copy=False)
if not isinstance(value, (np.ndarray, Index)):
if not isinstance(value, (np.ndarray, Index, ExtensionArray)):
Contributor:

you can use is_array_like here (and maybe in other places)

Contributor:

?

Contributor Author:

I'll just remove this change until we support array-backed indexes.

if isinstance(value, list) and len(value) > 0:
value = maybe_convert_platform(value)
else:
@@ -2826,7 +2827,7 @@ def reindexer(value):
value = maybe_cast_to_datetime(value, value.dtype)

# return internal types directly
if is_extension_type(value):
if is_extension_type(value) or is_extension_array_dtype(value):
Contributor Author:

This is an unfortunate intermediate stage where some of our internal extension types (sparse, datetime w/ tz) are not yet actual extension array types. We'll be able to clean this up this eventually.
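A rough illustration of that transitional split (the real `is_extension_type` / `is_extension_array_dtype` live in pandas' dtype-checking module; the simplified versions and the `_is_extension_dtype` flag below are stand-ins for this sketch, not the actual implementation):

```python
# Simplified stand-ins for the two predicates combined above. In the
# transitional state, is_extension_type covers the legacy internal types
# (categorical, sparse, datetime w/ tz) while is_extension_array_dtype
# covers new-style ExtensionArrays; a value is returned directly from the
# sanitizing machinery if either check passes.

LEGACY_EXTENSION_NAMES = {"category", "Sparse", "datetime64[ns, tz]"}


def is_extension_type(value):
    """Legacy check: pandas' own internal extension types, by dtype name."""
    dtype = getattr(value, "dtype", None)
    return getattr(dtype, "name", None) in LEGACY_EXTENSION_NAMES


def is_extension_array_dtype(value):
    """New-style check: anything exposing an ExtensionDtype-like dtype."""
    dtype = getattr(value, "dtype", None)
    return getattr(dtype, "_is_extension_dtype", False)


def should_return_directly(value):
    """The combined transitional predicate used in the hunk above."""
    return is_extension_type(value) or is_extension_array_dtype(value)
```

Once the legacy internal types are themselves backed by ExtensionArrays, the first predicate folds into the second and the `or` disappears.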

return value

# broadcast across multiple columns if necessary
@@ -3355,12 +3356,9 @@ class max type
new_obj = self.copy()

def _maybe_casted_values(index, labels=None):
if isinstance(index, PeriodIndex):
values = index.astype(object).values
elif isinstance(index, DatetimeIndex) and index.tz is not None:
values = index
else:
values = index.values
values = index._as_best_array()
# TODO: Check if necessary...
if not isinstance(index, (PeriodIndex, DatetimeIndex)):
Contributor:

what are you trying to do here?

Contributor Author:

This is identical to what happened before in the else branch. Seems like it's doing some type inference / coercion.
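A toy version of that inference step (the real converter is `pandas._libs.lib.maybe_convert_objects`, written in Cython; this much-simplified stand-in operates on plain lists just to show the idea):

```python
# After pulling values off the index, object-dtype data is run through a
# converter that tries to infer a tighter type; otherwise it stays object.

def maybe_convert_objects(values):
    """Return (values, inferred_dtype_name) for a list of Python objects."""
    if values and all(isinstance(v, bool) for v in values):
        return values, "bool"
    if values and all(
            isinstance(v, int) and not isinstance(v, bool) for v in values):
        return values, "int64"
    if values and all(isinstance(v, float) for v in values):
        return values, "float64"
    return values, "object"
```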

if values.dtype == np.object_:
values = lib.maybe_convert_objects(values)

34 changes: 33 additions & 1 deletion pandas/core/indexes/base.py
@@ -13,6 +13,7 @@
from pandas import compat

from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.generic import (
ABCSeries, ABCDataFrame,
ABCMultiIndex,
@@ -1038,6 +1039,31 @@ def _to_embed(self, keep_tz=False, dtype=None):

return self.values.copy()

def _as_best_array(self):
Contributor Author:

I found a need for a method like this. It may be good to add to Series as well, and maybe make public.

Essentially, we want a clear way to get the "best" .values. For Indexes with ExtensionArray types (Categorical for now; Interval and Period soon) that's the ExtensionArray. Otherwise, it's an ndarray (or, I think, the `DatetimeIndex` itself for datetime w/ tz right now).

Member:

(This will also depend on the decision of whether we change the return value of .values for Series with period and interval objects.)
For Series we already have something like ._values. Won't that be the equivalent there?

Contributor Author:

Yes, I think so. Maybe I should re-use that convention.

Unfortunately, DatetimeIndex[tz]._values returns an ndarray, w/o the TZ, where it "should" return a DatetimeIndex (and eventually an ExtensionArray). I'll see how badly changing that breaks things.

Contributor:

Yes, this should be the equivalent of _values. DTI should be fixed as a precursor PR. No problem making a different method name for this, but then simply rename _values to that name (I don't personally like _as_best_array; maybe _as_array is better). Should again be a precursor, independent PR.

Contributor Author:

Opened #19548

# type: () -> Union[ExtensionArray, ndarray]
"""Return the underlying values as the best array type.

Indexes backed by ExtensionArrays will return the ExtensionArray.
Otherwise, an ndarray is returned.

Examples
--------
>>> pd.Index([0, 1, 2])._as_best_array()
array([0, 1, 2])

>>> pd.CategoricalIndex(['a', 'a', 'b'])._as_best_array()
[a, a, b]
Categories (2, object): [a, b]

>>> pd.IntervalIndex.from_breaks([0, 1, 2])._as_best_array()
IntervalArray([(0, 1], (1, 2]])
"""
# We need this since CategoricalIndex.values -> Categorical
# but IntervalIndex.values -> ndarray[object]
# TODO: IntervalIndex defines _array_values. Would be nice to
# have an unambiguous way of getting an ndarray (or just use asarray?)
return self.values

_index_shared_docs['astype'] = """
Create an Index with values cast to dtypes. The class of a new Index
is determined by dtype. When conversion is impossible, a ValueError
@@ -1946,6 +1972,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):

if is_categorical_dtype(values.dtype):
values = np.array(values)

elif isinstance(values, ExtensionArray):
# This is still un-exercised within pandas, since all our
# extension dtypes have custom indexes.
values = values._formatting_values()

elif is_object_dtype(values.dtype):
values = lib.maybe_convert_objects(values, safe=1)

@@ -2525,7 +2557,7 @@ def get_value(self, series, key):
# if we have something that is Index-like, then
# use this, e.g. DatetimeIndex
s = getattr(series, '_values', None)
if isinstance(s, Index) and is_scalar(key):
if isinstance(s, (ExtensionArray, Index)) and is_scalar(key):
Contributor:

is_array_like might work here

try:
return s[key]
except (IndexError, ValueError):
3 changes: 3 additions & 0 deletions pandas/core/indexes/category.py
@@ -297,6 +297,9 @@ def get_values(self):
""" return the underlying data as an ndarray """
return self._data.get_values()

def _as_best_array(self):
return self._data

def tolist(self):
return self._data.tolist()

7 changes: 7 additions & 0 deletions pandas/core/indexes/datetimes.py
@@ -1034,6 +1034,13 @@ def _to_embed(self, keep_tz=False, dtype=None):

return self.values.copy()

def _as_best_array(self):
# no-tz -> ndarray
# tz -> DatetimeIndex (for now)
if self.tz is not None:
return self
return self.values

def to_pydatetime(self):
"""
Return DatetimeIndex as object ndarray of datetime.datetime objects