pandas-dev · TomAugspurger · Feb 23, 2018 · Feb 2, 2018 · Feb 3, 2018 · Feb 3, 2018
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -15,6 +15,7 @@
     is_unsigned_integer_dtype, is_signed_integer_dtype,
     is_integer_dtype, is_complex_dtype,
     is_object_dtype,
+    is_extension_array_dtype,
     is_categorical_dtype, is_sparse,
     is_period_dtype,
     is_numeric_dtype, is_float_dtype,
@@ -542,10 +543,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
 
     else:
 
-        if is_categorical_dtype(values) or is_sparse(values):
+        if is_extension_array_dtype(values) or is_sparse(values):
 
             # handle Categorical and sparse,
-            result = Series(values).values.value_counts(dropna=dropna)
+            result = Series(values)._values.value_counts(dropna=dropna)
             result.name = name
             counts = result.values
 
@@ -1290,10 +1291,12 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
     """
     Specialized Cython take which sets NaN values in one pass
 
+    This dispatches to ``take`` defined on ExtensionArrays.
+
     Parameters
     ----------
-    arr : ndarray
-        Input array
+    arr : ndarray, ExtensionArray, DatetimeIndex, IntervalIndex, SparseArray
+        Input array. SparseArray is densified with ``get_values``
     indexer : ndarray
         1-D array of indices to take, subarrays corresponding to -1 value
         indicies are filed with fill_value
@@ -1313,16 +1316,23 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
         If False, indexer is assumed to contain no -1 values so no filling
         will be done.  This short-circuits computation of a mask.  Result is
         undefined if allow_fill == False and -1 is present in indexer.
+
+    Returns
+    -------
+    subarray : object
+        May be the same type as the input, or cast to an ndarray.
     """
 
+    # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs
     # dispatch to internal type takes
-    if is_categorical(arr):
-        return arr.take_nd(indexer, fill_value=fill_value,
-                           allow_fill=allow_fill)
+    if is_extension_array_dtype(arr):
+        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
     elif is_datetimetz(arr):
         return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
     elif is_interval_dtype(arr):
         return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
+    elif is_sparse(arr):
+        arr = arr.get_values()
 
     if indexer is None:
         indexer = np.arange(arr.shape[axis], dtype=np.int64)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -25,14 +25,13 @@ class ExtensionArray(object):
     * isna
     * take
     * copy
-    * _formatting_values
     * _concat_same_type
 
-    Some additional methods are required to satisfy pandas' internal, private
+    Some additional methods are available to satisfy pandas' internal, private
     block API.
 
-    * _concat_same_type
     * _can_hold_na
+    * _formatting_values
 
     This class does not inherit from 'abc.ABCMeta' for performance reasons.
     Methods and properties required by the interface raise
@@ -53,13 +52,12 @@ class ExtensionArray(object):
     Extension arrays should be able to be constructed with instances of
     the class, i.e. ``ExtensionArray(extension_array)`` should return
     an instance, not error.
-
-    Additionally, certain methods and interfaces are required for proper
-    this array to be properly stored inside a ``DataFrame`` or ``Series``.
     """
+    _typ = 'extension'  # For pandas.core.dtypes.generic.ABCExtensionArray
     # ------------------------------------------------------------------------
     # Must be a Sequence
     # ------------------------------------------------------------------------
+
     def __getitem__(self, item):
         # type (Any) -> Any
         """Select a subset of self.
@@ -92,7 +90,41 @@ def __getitem__(self, item):
         raise AbstractMethodError(self)
 
     def __setitem__(self, key, value):
-        # type: (Any, Any) -> None
+        # type: (Union[int, np.ndarray], Any) -> None
+        """Set one or more values inplace.
+
+        Parameters
+        ----------
+        key : int or ndarray
+            When called from, e.g. ``Series.__setitem__``, ``key`` will be
+            one of
+
+            * scalar int
+            * ndarray of integers.
+            * boolean ndarray
+            * slice object
+
+        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
+            value or values to be set of ``key``.
+
+        Notes
+        -----
+        This method is not required to satisfy the interface. If an
+        ExtensionArray chooses to implement __setitem__, then some semantics
+        should be observed:
+
+        * Setting multiple values : ExtensionArrays should support setting
+          multiple values at once, ``key`` will be a sequence of integers and
+          ``value`` will be a same-length sequence.
+
+        * Broadcasting : For a sequence ``key`` and a scalar ``value``,
+          each position in ``key`` should be set to ``value``.
+
+        * Coercion : Most users will expect basic coercion to work. For
+          example, a string like ``'2018-01-01'`` is coerced to a datetime
+          when setting on a datetime64ns array. In general, if the
+        ``__init__`` method coerces that value, then so should ``__setitem__``.
+        """
         raise NotImplementedError(_not_implemented_message.format(
             type(self), '__setitem__')
         )
@@ -107,6 +139,16 @@ def __len__(self):
         # type: () -> int
         raise AbstractMethodError(self)
 
+    def __iter__(self):
+        """Iterate over elements.
+
+        This needs to be implemented so that pandas recognizes extension arrays
+        as list-like. The default implementation makes successive calls to
+        ``__getitem__``, which may be slower than necessary.
+        """
+        for i in range(len(self)):
+            yield self[i]
+
     # ------------------------------------------------------------------------
     # Required attributes
     # ------------------------------------------------------------------------
@@ -167,6 +209,25 @@ def isna(self):
         """
         raise AbstractMethodError(self)
 
+    def value_counts(self, dropna=True):
+        """Compute a histogram of the counts of non-null values.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't include counts of NaN
+
+        Returns
+        -------
+        value_counts : Series
+        """
+        from pandas import value_counts
+
+        if dropna:
+            self = self[~self.isna()]
+
+        return value_counts(np.array(self))
+
     # ------------------------------------------------------------------------
     # Indexing methods
     # ------------------------------------------------------------------------
@@ -184,8 +245,8 @@ def take(self, indexer, allow_fill=True, fill_value=None):
             will be done. This short-circuits computation of a mask. Result is
             undefined if allow_fill == False and -1 is present in indexer.
         fill_value : any, default None
-            Fill value to replace -1 values with. By default, this uses
-            the missing value sentinel for this type, ``self._fill_value``.
+            Fill value to replace -1 values with. If applicable, this should
+            use the sentinel missing value for this type.
 
         Notes
         -----
@@ -198,17 +259,20 @@ def take(self, indexer, allow_fill=True, fill_value=None):
 
         Examples
         --------
-        Suppose the extension array somehow backed by a NumPy structured array
-        and that the underlying structured array is stored as ``self.data``.
-        Then ``take`` may be written as
+        Suppose the extension array is backed by a NumPy array stored as
+        ``self.data``. Then ``take`` may be written as
 
         .. code-block:: python
 
            def take(self, indexer, allow_fill=True, fill_value=None):
                mask = indexer == -1
                result = self.data.take(indexer)
-               result[mask] = self._fill_value
+               result[mask] = self._fill_value  # NA for this type
                return type(self)(result)
+
+        See Also
+        --------
+        numpy.take
         """
         raise AbstractMethodError(self)
 
@@ -230,17 +294,12 @@ def copy(self, deep=False):
     # ------------------------------------------------------------------------
     # Block-related methods
     # ------------------------------------------------------------------------
-    @property
-    def _fill_value(self):
-        # type: () -> Any
-        """The missing value for this type, e.g. np.nan"""
-        return None
 
     def _formatting_values(self):
         # type: () -> np.ndarray
         # At the moment, this has to be an array since we use result.dtype
         """An array of values to be printed in, e.g. the Series repr"""
-        raise AbstractMethodError(self)
+        return np.array(self)
 
     @classmethod
     def _concat_same_type(cls, to_concat):
@@ -257,6 +316,7 @@ def _concat_same_type(cls, to_concat):
         """
         raise AbstractMethodError(cls)
 
+    @property
     def _can_hold_na(self):
         # type: () -> bool
         """Whether your array can hold missing values. True by default.

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -1,4 +1,6 @@
 """Extend pandas with custom array types"""
+import inspect
+
 from pandas.errors import AbstractMethodError
 
 
@@ -106,7 +108,8 @@ def is_dtype(cls, dtype):
 
         Parameters
         ----------
-        dtype : str or dtype
+        dtype : str, object, or type
+            The dtype to check.
 
         Returns
         -------
@@ -118,12 +121,15 @@ def is_dtype(cls, dtype):
 
         1. ``cls.construct_from_string(dtype)`` is an instance
            of ``cls``.
-        2. 'dtype' is ``cls`` or a subclass of ``cls``.
+        2. ``dtype`` is an object and is an instance of ``cls``
+        3. ``dtype`` is a class and is ``cls`` or a subclass of ``cls``.
         """
         if isinstance(dtype, str):
             try:
                 return isinstance(cls.construct_from_string(dtype), cls)
             except TypeError:
                 return False
-        else:
+        elif inspect.isclass(dtype):
             return issubclass(dtype, cls)
+        else:
+            return isinstance(dtype, cls)
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1708,9 +1708,9 @@ def is_extension_array_dtype(arr_or_dtype):
     """
     from pandas.core.arrays import ExtensionArray
 
-    # we want to unpack series, anything else?
     if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)):
         arr_or_dtype = arr_or_dtype._values
+
     return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))
 
 

diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py
@@ -57,6 +57,8 @@ def _check(cls, inst):
 ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ",
                                        ("dateoffset",))
 ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", ))
+ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ",
+                                           ("extension",))
 
 
 class _ABCGeneric(type):

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -5,14 +5,16 @@
 from pandas._libs import lib, missing as libmissing
 from pandas._libs.tslib import NaT, iNaT
 from .generic import (ABCMultiIndex, ABCSeries,
-                      ABCIndexClass, ABCGeneric)
+                      ABCIndexClass, ABCGeneric,
+                      ABCExtensionArray)
 from .common import (is_string_dtype, is_datetimelike,
                      is_datetimelike_v_numeric, is_float_dtype,
                      is_datetime64_dtype, is_datetime64tz_dtype,
                      is_timedelta64_dtype, is_interval_dtype,
-                     is_complex_dtype, is_categorical_dtype,
+                     is_complex_dtype,
                      is_string_like_dtype, is_bool_dtype,
                      is_integer_dtype, is_dtype_equal,
+                     is_extension_array_dtype,
                      needs_i8_conversion, _ensure_object,
                      pandas_dtype,
                      is_scalar,
@@ -57,7 +59,8 @@ def _isna_new(obj):
     # hack (for now) because MI registers as ndarray
     elif isinstance(obj, ABCMultiIndex):
         raise NotImplementedError("isna is not defined for MultiIndex")
-    elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
+    elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
+                          ABCExtensionArray)):
         return _isna_ndarraylike(obj)
     elif isinstance(obj, ABCGeneric):
         return obj._constructor(obj._data.isna(func=isna))
@@ -124,30 +127,29 @@ def _use_inf_as_na(key):
 
 
 def _isna_ndarraylike(obj):
-
     values = getattr(obj, 'values', obj)
     dtype = values.dtype
 
-    if is_string_dtype(dtype):
-        if is_categorical_dtype(values):
-            from pandas import Categorical
-            if not isinstance(values, Categorical):
-                values = values.values
-            result = values.isna()
-        elif is_interval_dtype(values):
-            from pandas import IntervalIndex
-            result = IntervalIndex(obj).isna()
+    if is_extension_array_dtype(obj):
+        if isinstance(obj, (ABCIndexClass, ABCSeries)):
+            values = obj._values
         else:
+            values = obj
+        result = values.isna()
+    elif is_interval_dtype(values):
+        # TODO(IntervalArray): remove this if block
+        from pandas import IntervalIndex
+        result = IntervalIndex(obj).isna()
+    elif is_string_dtype(dtype):
+        # Working around NumPy ticket 1542
+        shape = values.shape
 
-            # Working around NumPy ticket 1542
-            shape = values.shape
-
-            if is_string_like_dtype(dtype):
-                result = np.zeros(values.shape, dtype=bool)
-            else:
-                result = np.empty(shape, dtype=bool)
-                vec = libmissing.isnaobj(values.ravel())
-                result[...] = vec.reshape(shape)
+        if is_string_like_dtype(dtype):
+            result = np.zeros(values.shape, dtype=bool)
+        else:
+            result = np.empty(shape, dtype=bool)
+            vec = libmissing.isnaobj(values.ravel())
+            result[...] = vec.reshape(shape)
 
     elif needs_i8_conversion(obj):
         # this is the NaT pattern
@@ -406,4 +408,7 @@ def remove_na_arraylike(arr):
     """
     Return array-like containing only true/non-NaN values, possibly empty.
     """
-    return arr[notna(lib.values_from_object(arr))]
+    if is_extension_array_dtype(arr):
+        return arr[notna(arr)]
+    else:
+        return arr[notna(lib.values_from_object(arr))]