
ENH: Allow storing ExtensionArrays in containers #19520

Merged
merged 139 commits into from
Feb 23, 2018
Merged
Changes from 99 commits
139 commits
b7069ef
ENH: non-interval changes
TomAugspurger Feb 2, 2018
9cd92c7
COMPAT: py2 Super
TomAugspurger Feb 3, 2018
9211bbd
BUG: Use original object for extension array
TomAugspurger Feb 3, 2018
80f83a6
Consistent boxing / unboxing
TomAugspurger Feb 3, 2018
ca004d8
32-bit compat
TomAugspurger Feb 3, 2018
5d4a686
Add a test array
TomAugspurger Feb 5, 2018
9f4ad42
linting
TomAugspurger Feb 5, 2018
b1db4e8
Default __iter__
TomAugspurger Feb 5, 2018
00d6bb3
Tests for value_counts
TomAugspurger Feb 5, 2018
1608e3d
Implement value_counts
TomAugspurger Feb 5, 2018
52e2180
Py2 compat
TomAugspurger Feb 5, 2018
e6d06e2
Fixed dropna
TomAugspurger Feb 5, 2018
d356f19
Test fixups
TomAugspurger Feb 6, 2018
a6ae340
Started setitem
TomAugspurger Feb 6, 2018
41f09d8
REF/Clean: Internal / External values
TomAugspurger Feb 3, 2018
29cfd7c
Move to index base
TomAugspurger Feb 6, 2018
05eb9f3
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 6, 2018
82fd0c6
Setitem tests, decimal example
TomAugspurger Feb 7, 2018
8b1e7d6
Compat
TomAugspurger Feb 7, 2018
10af4b6
Fixed extension block tests.
TomAugspurger Feb 7, 2018
cd5f1eb
Clarify binop tests
TomAugspurger Feb 7, 2018
0a9d9fd
TST: Removed ops tests
TomAugspurger Feb 7, 2018
3185f4e
Cleanup unique handling
TomAugspurger Feb 7, 2018
ffa888a
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 7, 2018
5a59591
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 7, 2018
476f75d
Simplify object concat
TomAugspurger Feb 7, 2018
b15ee5a
Use values for intersection
TomAugspurger Feb 7, 2018
659073f
hmm
TomAugspurger Feb 7, 2018
b15ecac
More failing tests
TomAugspurger Feb 7, 2018
88b8f4f
remove bad test
TomAugspurger Feb 7, 2018
349ac1a
better setitem
TomAugspurger Feb 8, 2018
27ab045
Dropna works.
TomAugspurger Feb 8, 2018
8358fb1
Restore xfail test
TomAugspurger Feb 8, 2018
8ef34a9
Test Categorical
TomAugspurger Feb 8, 2018
340d11b
Xfail setitem tests
TomAugspurger Feb 8, 2018
8297888
TST: Skip JSON tests on py2
TomAugspurger Feb 8, 2018
7accb67
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 8, 2018
9b8d2a5
Additional testing
TomAugspurger Feb 8, 2018
9fbac29
More tests
TomAugspurger Feb 8, 2018
55305dc
ndarray_values
TomAugspurger Feb 8, 2018
0e63708
API: Default ExtensionArray.astype
TomAugspurger Feb 8, 2018
fbbbc8a
Simplify concat_as_object
TomAugspurger Feb 8, 2018
46a0a49
Py2 compat
TomAugspurger Feb 8, 2018
2c4445a
Set-ops ugliness
TomAugspurger Feb 8, 2018
5612cda
better docstrings
TomAugspurger Feb 8, 2018
b012c19
tolist
TomAugspurger Feb 8, 2018
d49e6aa
linting
TomAugspurger Feb 8, 2018
d7d31ee
Moved dtypes
TomAugspurger Feb 9, 2018
7b89f1b
clean
TomAugspurger Feb 9, 2018
b0dbffd
cleanup
TomAugspurger Feb 9, 2018
66b936f
NumPy compat
TomAugspurger Feb 9, 2018
32ee0ef
Use base _values for CategoricalIndex
TomAugspurger Feb 9, 2018
a9882e2
Update dev docs
TomAugspurger Feb 9, 2018
f53652a
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 9, 2018
2425621
cleanup
TomAugspurger Feb 9, 2018
fc337de
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 9, 2018
6abe9da
cleanup
TomAugspurger Feb 9, 2018
512fb89
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 9, 2018
c5db5da
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 9, 2018
0b112f2
cleanup
TomAugspurger Feb 9, 2018
170d0c7
Linting
TomAugspurger Feb 9, 2018
402620f
Precision in tests
TomAugspurger Feb 9, 2018
b556f83
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 9, 2018
268aabc
Linting
TomAugspurger Feb 9, 2018
d671259
Move to extension
TomAugspurger Feb 9, 2018
cb740ed
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 9, 2018
d9e8dd6
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 9, 2018
815d202
Push _ndarray_values to ExtensionArray
TomAugspurger Feb 11, 2018
a727b21
Clean up tolist
TomAugspurger Feb 11, 2018
f368c29
Move test locations
TomAugspurger Feb 11, 2018
d74c5c9
Fixed test
TomAugspurger Feb 12, 2018
0ec6958
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 12, 2018
8104ee5
REF: Update per comments
TomAugspurger Feb 12, 2018
4d08218
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 12, 2018
f8e29b9
lint
TomAugspurger Feb 12, 2018
0cd9faa
REF: Use _values for size and shape
TomAugspurger Feb 12, 2018
8fcdb70
PERF: Implement size, shape for IntervalIndex
TomAugspurger Feb 12, 2018
34a6a22
PERF: Avoid materializing values for PeriodIndex shape, size
TomAugspurger Feb 12, 2018
b4de5c4
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 12, 2018
c233c28
Merge remote-tracking branch 'upstream/master' into index-values
TomAugspurger Feb 13, 2018
d6e8051
Cleanup
TomAugspurger Feb 13, 2018
3af8a21
Override nbytes
TomAugspurger Feb 13, 2018
3a5118b
Merge branch 'index-values' into pandas-array-upstream+fu1
TomAugspurger Feb 13, 2018
1e8e87e
Remove unused change
TomAugspurger Feb 13, 2018
0f5e4f0
Docs
TomAugspurger Feb 13, 2018
1436d1d
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 13, 2018
c4dab88
Test cleanpu
TomAugspurger Feb 13, 2018
a312ba5
Always set PANDAS_TESTING_MODE
TomAugspurger Feb 13, 2018
758689f
Revert "Always set PANDAS_TESTING_MODE"
TomAugspurger Feb 13, 2018
02c3d40
Explicitly catch warnings or not
TomAugspurger Feb 13, 2018
9e17037
fastparquet warnings
TomAugspurger Feb 13, 2018
4599db4
Unicode literals strikes again.
TomAugspurger Feb 14, 2018
d34d9ca
Restore circle env var
TomAugspurger Feb 14, 2018
29d2528
More parquet test catching
TomAugspurger Feb 14, 2018
905d72e
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 14, 2018
412c951
No stacklevel
TomAugspurger Feb 14, 2018
78834f1
Lower bound on FP
TomAugspurger Feb 15, 2018
f8eac55
Exact bound for FP
TomAugspurger Feb 15, 2018
f09c863
Don't use fastpath for ExtensionBlock make_block
TomAugspurger Feb 15, 2018
cedb63d
Consistently use _values
TomAugspurger Feb 16, 2018
cae2c26
TST: Additional constructor tests
TomAugspurger Feb 16, 2018
8088096
CLN: de-nested a bit
TomAugspurger Feb 16, 2018
8aed325
_fill_value handling
TomAugspurger Feb 16, 2018
453728a
Handle user provided dtype in constructors.
TomAugspurger Feb 16, 2018
cc13c8d
Document ExtensionBlock._maybe_coerce_values
TomAugspurger Feb 16, 2018
f90ac07
Created ABCExtensionArray
TomAugspurger Feb 16, 2018
4a03b26
TST: Tests for is_object_dtype and is_string_dtype and EAs
TomAugspurger Feb 16, 2018
635223f
fixup! Handle user provided dtype in constructors.
TomAugspurger Feb 17, 2018
cf423a7
Doc for setitem
TomAugspurger Feb 17, 2018
2d1a66c
Split base tests
TomAugspurger Feb 17, 2018
c849865
Revert test_parquet changes
TomAugspurger Feb 17, 2018
c3ec822
API: Removed _fill_value from the interface
TomAugspurger Feb 17, 2018
f4cf45c
Push coercion to extension dtype till later
TomAugspurger Feb 17, 2018
9c5d479
Linting
TomAugspurger Feb 17, 2018
1175c0d
ERR: Better error message for coercion to 3rd party dtypes
TomAugspurger Feb 17, 2018
c816d99
CLN: Make take_nd EA aware
TomAugspurger Feb 17, 2018
9c9f59e
Revert sparse changes
TomAugspurger Feb 17, 2018
08af9a3
Other _typ for ABCExtensionArray
TomAugspurger Feb 17, 2018
2e992f7
Test cleanup and expansion.
TomAugspurger Feb 17, 2018
cc5cc3e
Copy if copy
TomAugspurger Feb 17, 2018
704ee67
TST: remove self param for fixture
TomAugspurger Feb 18, 2018
c262968
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 19, 2018
8bf0334
Remove unnescessary EA handling in Series ctor
TomAugspurger Feb 20, 2018
c8d88da
API: Removed value_counts
TomAugspurger Feb 20, 2018
24f3b60
More doc notes
TomAugspurger Feb 20, 2018
50bd5dd
Handle expanding a DataFrame with an EA
TomAugspurger Feb 20, 2018
879bc84
Added ExtensionDtype.__eq__
TomAugspurger Feb 20, 2018
33c9d1f
linting
TomAugspurger Feb 21, 2018
f07c166
REF: is_dtype_equal refactor
TomAugspurger Feb 21, 2018
79d43b1
Remove reference to dtype being a class
TomAugspurger Feb 21, 2018
91c629b
Merge remote-tracking branch 'upstream/master' into pandas-array-upst…
TomAugspurger Feb 21, 2018
a1ebf53
move
TomAugspurger Feb 22, 2018
aa57cad
Moved sparse check to take_nd
TomAugspurger Feb 22, 2018
c82748c
Docstring
TomAugspurger Feb 22, 2018
e919343
Split tests
TomAugspurger Feb 22, 2018
1ea74da
Revert index change
TomAugspurger Feb 22, 2018
0c41a34
Copy changes
TomAugspurger Feb 22, 2018
009bece
Simplify EA implementation names
TomAugspurger Feb 22, 2018
ea5562b
Linting
TomAugspurger Feb 22, 2018
1 change: 1 addition & 0 deletions circle.yml
@@ -2,6 +2,7 @@ machine:
environment:
# these are globally set
MINICONDA_DIR: /home/ubuntu/miniconda3
PANDAS_TESTING_MODE: deprecate


database:
3 changes: 2 additions & 1 deletion pandas/core/algorithms.py
@@ -15,6 +15,7 @@
is_unsigned_integer_dtype, is_signed_integer_dtype,
is_integer_dtype, is_complex_dtype,
is_object_dtype,
is_extension_array_dtype,
is_categorical_dtype, is_sparse,
is_period_dtype,
is_numeric_dtype, is_float_dtype,
@@ -542,7 +543,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,

else:

if is_categorical_dtype(values) or is_sparse(values):
if is_extension_array_dtype(values) or is_sparse(values):
Contributor:

is_sparse should be recognized as an extension array type

Contributor Author:

It's on my todo list. I'll do it after interval.


# handle Categorical and sparse,
result = Series(values).values.value_counts(dropna=dropna)
Member:

Shouldn't this be Series(values)._values ? Because I would expect this to fail for anything that is not a categorical (as a category series returns a categorical, but for now others return a ndarray there?)

Contributor Author:

This should be ._values, but

but for now others return a ndarray there?

What types do you mean by others? Internally, our only type returning true here is Categorical (or sparse of is_sparse).

Member:

Yeah, forgot again for a moment that we don't yet have those other internal extension arrays apart from Categorical :)

Contributor Author (TomAugspurger, Feb 16, 2018):

I also wonder why we're doing Series(values).values here. I suspect to handle any of categorical / series / index inputs in a consistent way.

Probably better to make an _unbox_pandas method that extracts the underlying array from an array (no-op) or Series/Index (._values).

80 changes: 70 additions & 10 deletions pandas/core/arrays/base.py
@@ -25,14 +25,14 @@ class ExtensionArray(object):
* isna
* take
* copy
* _formatting_values
* _concat_same_type

Some additional methods are required to satisfy pandas' internal, private
block API.

* _concat_same_type
* _can_hold_na
* _formatting_values
* _fill_value

This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
@@ -53,9 +53,6 @@ class ExtensionArray(object):
Extension arrays should be able to be constructed with instances of
the class, i.e. ``ExtensionArray(extension_array)`` should return
an instance, not error.

Additionally, certain methods and interfaces are required for proper
this array to be properly stored inside a ``DataFrame`` or ``Series``.
"""
# ------------------------------------------------------------------------
# Must be a Sequence
@@ -92,7 +89,37 @@ def __getitem__(self, item):
raise AbstractMethodError(self)

def __setitem__(self, key, value):
# type: (Any, Any) -> None
# type: (Union[int, np.ndarray], Any) -> None
"""Set one or more values inplace.
Contributor Author:

Added some notes on the semantics of setitem if people want to take a look.


Parameters
----------
key : int or ndarray
Contributor:

this is incongruous with the below

Contributor Author:

Added slice

When called from, e.g. ``Series.__setitem__``, ``key`` will
always be an ndarray of integers.
Member:

Is a boolean mask also converted to array of integers by pandas?

If so, maybe explicitly mention that the developer can also implement boolean mask and slicing (to be similar to ndarray), but that this is not required for pandas?

Contributor Author:

Boolean setitem seems to be broken right now. Adding a test.

Member:

Should the above sentence in the docstring be updated?

Contributor:

could also be a slice

value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
ExtensionArrays may
Member:

may ... (something missing?)


Notes
-----
This method is not required to satisfy the interface. If an
ExtensionArray chooses to implement __setitem__, then some semantics
should be observed.

* Setting multiple values : ExtensionArrays should support setting
multiple values at once, ``key`` will be a sequence of integers.

* Broadcasting : For a sequence ``key`` and a scalar ``value``,
each position in ``key`` should be set to ``value``.

* Coercion : Most users will expect basic coercion to work. For
Contributor:

this is very hard to support / do. I would not mention this, and its up to the EA subclass to do this (and to be honest kind of breaks the contract, IOW this is supposed to be an integer / slice and not a coercible)

Member:

It's indeed up to the EA author to decide on this, but this is about coercing value, not key, so it does not interfere with the contract of what key can be

Contributor Author:

This section may not be appropriate for the docstring anyway. I'll remove it and add it to the narrative docs (which I've started on another branch).

Contributor Author:

This section may not be appropriate for the docstring anyway.

To clarify, we have two consumers of the docs. Library authors implementing an EA and users of that library's EA.

To the extent possible, I think we should write docstrings for users of the EA. We can put notes on expected implementation elsewhere.

Member:

Another possibility (or in addition) is to leave some implementation comments as actual comments. Like

def method(..):
    """
    User oriented docstring
    """
    # some additional
    # developer oriented comments

as I think it is useful to keep some of those things in the code itself, for someone looking at the base class implementation.
I think eg __iter__, nbytes, .. are other examples where we could do such a split

example, a string like ``'2018-01-01'`` is coerced to a datetime
when setting on a datetime64ns array. In general, if the
``__init__`` method coerces that value, then so should ``__setitem__``.

When called from, e.g. ``Series.__setitem__``, ``key`` will always
be an ndarray of positions.
"""
raise NotImplementedError(_not_implemented_message.format(
type(self), '__setitem__')
)
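The semantics listed in the docstring above (integer/slice/ndarray keys, setting multiple values, broadcasting a scalar, coercing the value the same way `__init__` would) can be sketched with a toy NumPy-backed array; the class name and its `data` attribute are illustrative assumptions, not pandas API:

```python
import numpy as np


class NumpyBackedArray:
    """Toy array sketching the __setitem__ semantics described above.
    Not a full ExtensionArray implementation."""

    def __init__(self, values):
        # coercion applied here is mirrored by __setitem__ below
        self.data = np.asarray(values, dtype="float64")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def __setitem__(self, key, value):
        # NumPy assignment already handles int / slice / ndarray keys,
        # broadcasts a scalar value across a sequence key, and coerces
        # value to the array's dtype (float64 here).
        self.data[key] = value


arr = NumpyBackedArray([1, 2, 3, 4])
arr[np.array([0, 2])] = 10   # scalar broadcast across integer positions
arr[1:3] = [7, 8]            # slice key, multiple values
```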
@@ -107,6 +134,16 @@ def __len__(self):
# type: () -> int
raise AbstractMethodError(self)

def __iter__(self):
"""Iterate over elements.

This needs to be implemented so that pandas recognizes extension arrays
Contributor:

can be a better doc-string here

as list-like. The default implementation makes successive calls to
``__getitem__``, which may be slower than necessary.
"""
for i in range(len(self)):
Contributor:

why are you not just raising AbstractMethodError?

Contributor:

this is tied intimately with the boxing concept of scalars, but this should be decoupled from getitem (yes its a valid implementation, but you would never actually do it this way), rather you would iterate on some underlying value and box it, not index into and as a side effect use getitem to box.

Contributor Author:

rather you would iterate on some underlying value

We don't know how the subclass is storing things though, so there's no .values to iterate over.

Contributor:

so my point is that you cannot implement a default here, make the sub-classes do it. yes it might repeat some code, but its much better there i think.

Member:

you cannot

Yes, we can (see the code). You "don't want" it, that's something different :)
But I don't really understand why the above is not a good default implementation. I think a lot of subclasses will do exactly this.

Contributor Author:

And some context: If I remove __iter__, the object will still be iterable with essentially this implementation since it defines __getitem__ and __len__. We just need to implement it explicitly so that is_iterable works correctly.

Contributor:

I think it still might be better to raise here by default and force the subclass to implement this (even if it is this exact implementation). It is SO important for subclasses to pay attention to this, it should just be required.

Member:

You can't do much better than this implementation.
To give you some context: in geopandas, I have a cython method that converts a GeometryArray to numpy object array of Geometry objects, which takes ca 80ms on an array of 10000 points. Simply doing np.array([a[i] for i in range(len(a))], dtype=object) is ca 108 ms. This small difference is because by definition in each iteration I need to create a python object.

yield self[i]
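The point made in the thread above — that an object defining only `__getitem__` and `__len__` still iterates via Python's old sequence protocol, but an explicit `__iter__` is needed for `hasattr`-based checks like pandas' `is_iterable` — can be demonstrated with two toy classes (both hypothetical, for illustration only):

```python
class NoIter:
    """Iterable only through the legacy __getitem__/__len__ protocol."""

    def __init__(self, data):
        self.data = list(data)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]


class WithIter(NoIter):
    """Adds the same successive-__getitem__ default as the base class
    in the diff above, making hasattr(obj, '__iter__') succeed."""

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]
```

Both classes produce identical iteration results; the difference only shows up in attribute-based iterability checks.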

# ------------------------------------------------------------------------
# Required attributes
# ------------------------------------------------------------------------
@@ -167,6 +204,25 @@ def isna(self):
"""
raise AbstractMethodError(self)

def value_counts(self, dropna=True):
"""Compute a histogram of the counts of non-null values.

Parameters
----------
dropna : bool, default True
Don't include counts of NaN

Returns
-------
value_counts : Series
"""
from pandas import value_counts

if dropna:
self = self[~self.isna()]

return value_counts(np.array(self))
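The default `value_counts` above is roughly equivalent to the following, spelled out on a plain NumPy array instead of `self` (using `Series.value_counts` here as a stand-in for the `pandas.value_counts` function the diff imports):

```python
import numpy as np
import pandas as pd

# the dropna=True path of the default implementation above
values = np.array([1.0, 1.0, 2.0, np.nan])
kept = values[~pd.isna(values)]           # self = self[~self.isna()]
counts = pd.Series(kept).value_counts()   # value_counts(np.array(self))
```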

# ------------------------------------------------------------------------
# Indexing methods
# ------------------------------------------------------------------------
@@ -198,9 +254,8 @@ def take(self, indexer, allow_fill=True, fill_value=None):

Examples
--------
Suppose the extension array somehow backed by a NumPy structured array
and that the underlying structured array is stored as ``self.data``.
Then ``take`` may be written as
Suppose the extension array is backed by a NumPy array stored as
``self.data``. Then ``take`` may be written as

.. code-block:: python

@@ -209,6 +264,10 @@ def take(self, indexer, allow_fill=True, fill_value=None):
result = self.data.take(indexer)
result[mask] = self._fill_value
Contributor:

hmm, is there a reason we don't define _na_value? (e.g we do on indexes) as a property?

Member:

Originally there was a self._fill_value, but because this was not used internally, we left it out for the moment. The discussion about that is a bit spread, but see eg #19520 (comment) (4th bullet point is about fill_value, and comment below with Tom's answer) and #19520 (comment) and the comments below that

Contributor:

ok that's fine. I think we do need / want a way to set this and a property is good, and _na_value is used elsewhere (eg. in Index)

return type(self)(result)

See Also
--------
numpy.take
"""
raise AbstractMethodError(self)
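The `take` sketch in the docstring above can be made runnable for a plain float array; `data` and `fill_value` here stand in for the hypothetical `self.data` and `self._fill_value`, and the `-1`-means-missing convention follows the pandas indexer convention the docstring assumes:

```python
import numpy as np


def take_with_fill(data, indexer, fill_value=np.nan):
    """Runnable version of the docstring sketch: -1 in the indexer
    marks a missing position, filled with fill_value afterwards."""
    indexer = np.asarray(indexer)
    mask = indexer == -1
    # take() on -1 grabs the last element; the mask overwrites it below
    result = np.asarray(data, dtype="float64").take(indexer)
    result[mask] = fill_value
    return result


out = take_with_fill([10.0, 20.0, 30.0], [0, -1, 2])
```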

@@ -240,7 +299,7 @@ def _formatting_values(self):
# type: () -> np.ndarray
# At the moment, this has to be an array since we use result.dtype
"""An array of values to be printed in, e.g. the Series repr"""
raise AbstractMethodError(self)
return np.array(self)
Contributor:

NO, this should be implemented only by subclasses.

Contributor Author:

Why?

Member:

This is a very sensible default IMO, subclasses can always override if needed


@classmethod
def _concat_same_type(cls, to_concat):
@@ -257,6 +316,7 @@ def _concat_same_type(cls, to_concat):
"""
raise AbstractMethodError(cls)

@property
def _can_hold_na(self):
Contributor Author:

Interface change: This should have been a property.

# type: () -> bool
"""Whether your array can hold missing values. True by default.
4 changes: 4 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -2141,6 +2141,10 @@ def repeat(self, repeats, *args, **kwargs):
def _can_hold_na(self):
return True

@property
def _fill_value(self):
return np.nan
Member:

In general this fill_value would depend on the type of the categories ...
Did you get an error somewhere if this was not added?

Contributor Author:

Yes... I'll fix that.

Contributor:

should be:

self.categories._na_value
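The reviewer's point — that the right fill value depends on the dtype of the categories — can be seen via the private `Index._na_value` attribute referenced above (private pandas API, shown here only to illustrate the suggestion):

```python
import numpy as np
import pandas as pd

# object categories -> nan; datetime categories -> NaT
obj_na = pd.Categorical(["a", "b"]).categories._na_value
dt_na = pd.Categorical(pd.to_datetime(["2018-01-01"])).categories._na_value
```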


@classmethod
def _concat_same_type(self, to_concat):
from pandas.core.dtypes.concat import _concat_categorical
12 changes: 9 additions & 3 deletions pandas/core/dtypes/base.py
@@ -1,4 +1,6 @@
"""Extend pandas with custom array types"""
import inspect

from pandas.errors import AbstractMethodError


@@ -106,7 +108,8 @@ def is_dtype(cls, dtype):

Parameters
----------
dtype : str or dtype
dtype : str, object, or type
The dtype to check.

Returns
-------
@@ -118,12 +121,15 @@ def is_dtype(cls, dtype):

1. ``cls.construct_from_string(dtype)`` is an instance
of ``cls``.
2. 'dtype' is ``cls`` or a subclass of ``cls``.
2. ``dtype`` is an object and is an instance of ``cls``
3. 'dtype' is a class and is ``cls`` or a subclass of ``cls``.
Contributor:

shouldn't this have double-ticks?

"""
if isinstance(dtype, str):
try:
return isinstance(cls.construct_from_string(dtype), cls)
except TypeError:
return False
else:
elif inspect.isclass(dtype):
Contributor:

you have 2 conditions the same here

Contributor Author:

issubclass vs. isinstance :) Want to be able to handle .is_dtype(ExtensionDtype) and .is_dtype(ExtensionDtype()).

return issubclass(dtype, cls)
else:
return isinstance(dtype, cls)
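The issubclass-vs-isinstance distinction discussed above can be reproduced with a toy dtype class (hypothetical, not pandas code) using the same three-way branching as the diff:

```python
import inspect


class ToyDtype:
    """Toy stand-in reproducing the is_dtype branching above."""

    @classmethod
    def construct_from_string(cls, string):
        if string != "toy":
            raise TypeError("cannot construct from %r" % string)
        return cls()

    @classmethod
    def is_dtype(cls, dtype):
        if isinstance(dtype, str):
            try:
                return isinstance(cls.construct_from_string(dtype), cls)
            except TypeError:
                return False
        elif inspect.isclass(dtype):
            # handles .is_dtype(ToyDtype) -- the class itself
            return issubclass(dtype, cls)
        else:
            # handles .is_dtype(ToyDtype()) -- an instance
            return isinstance(dtype, cls)
```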
2 changes: 1 addition & 1 deletion pandas/core/dtypes/common.py
@@ -1708,9 +1708,9 @@ def is_extension_array_dtype(arr_or_dtype):
"""
from pandas.core.arrays import ExtensionArray

# we want to unpack series, anything else?
if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)):
arr_or_dtype = arr_or_dtype._values

return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))


30 changes: 19 additions & 11 deletions pandas/core/dtypes/missing.py
@@ -10,9 +10,10 @@
is_datetimelike_v_numeric, is_float_dtype,
is_datetime64_dtype, is_datetime64tz_dtype,
is_timedelta64_dtype, is_interval_dtype,
is_complex_dtype, is_categorical_dtype,
is_complex_dtype,
is_string_like_dtype, is_bool_dtype,
is_integer_dtype, is_dtype_equal,
is_extension_array_dtype,
needs_i8_conversion, _ensure_object,
pandas_dtype,
is_scalar,
@@ -52,12 +53,15 @@ def isna(obj):


def _isna_new(obj):
from ..arrays import ExtensionArray
Contributor:

pls don't do this. rather define an ABCExtensionArray. instead. This is the purpose of these, to avoid smell like this.


if is_scalar(obj):
return libmissing.checknull(obj)
# hack (for now) because MI registers as ndarray
elif isinstance(obj, ABCMultiIndex):
raise NotImplementedError("isna is not defined for MultiIndex")
elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
ExtensionArray)):
return _isna_ndarraylike(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isna(func=isna))
@@ -124,17 +128,18 @@ def _use_inf_as_na(key):


def _isna_ndarraylike(obj):

values = getattr(obj, 'values', obj)
dtype = values.dtype

if is_string_dtype(dtype):
if is_categorical_dtype(values):
from pandas import Categorical
if not isinstance(values, Categorical):
values = values.values
result = values.isna()
elif is_interval_dtype(values):
if is_extension_array_dtype(obj):
if isinstance(obj, (ABCIndexClass, ABCSeries)):
values = obj._values
else:
values = obj
result = values.isna()
elif is_string_dtype(dtype):
Member:

I know it was already there, but I find the use of is_string_dtype rather confusing here, as intervals are not strings at all (but it just checks for string or object dtype ..).
But maybe can do

elif is_interval_dtype(values):
    ...
elif is string_dtype(values):
    ...

Contributor:

instead of this, pls fix is_string_dtype (to exclude interval as well as period dtypes (you might be able to say object and not extension_type).

Contributor Author:

This will fix itself once IntervalArray is a proper extension type. Added tests to confirm that extension types are not True for is_string_dtype and is_object_dtype.

if is_interval_dtype(values):
# TODO(IntervalArray): remove this if block
from pandas import IntervalIndex
result = IntervalIndex(obj).isna()
else:
@@ -406,4 +411,7 @@ def remove_na_arraylike(arr):
"""
Return array-like containing only true/non-NaN values, possibly empty.
"""
return arr[notna(lib.values_from_object(arr))]
if is_extension_array_dtype(arr):
Contributor:

ultimately values_from_object just calls .get_values() on the object if it exists. This smells like something is missing from the EA interface.

Contributor Author:

I don't think this if / elif can be avoided. values_from_object looks up arr.get_values and calls it if it is defined, which returns an ndarray. But we need to call notna on the EA itself, since it has special missing-value handling.

+        return arr[notna(arr)]
+    else:
+        return arr[notna(lib.values_from_object(arr))]
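The resulting branch can be sketched with public APIs only (`remove_na_arraylike_sketch` is a hypothetical stand-in, with `pd.notna` and `np.asarray` replacing the private helpers):

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_extension_array_dtype

def remove_na_arraylike_sketch(arr):
    # Extension arrays do their own missing-value handling, so notna is
    # called on the array itself rather than on an ndarray view of it.
    if is_extension_array_dtype(arr):
        return arr[pd.notna(arr)]
    return arr[pd.notna(np.asarray(arr))]

s = pd.Series(pd.Categorical(["a", None, "b"]))
print(list(remove_na_arraylike_sketch(s)))
```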
21 changes: 9 additions & 12 deletions pandas/core/frame.py
@@ -39,6 +39,7 @@
     is_categorical_dtype,
     is_object_dtype,
     is_extension_type,
+    is_extension_array_dtype,
     is_datetimetz,
     is_datetime64_any_dtype,
     is_datetime64tz_dtype,
@@ -71,7 +72,7 @@
     create_block_manager_from_arrays,
     create_block_manager_from_blocks)
 from pandas.core.series import Series
-from pandas.core.arrays import Categorical
+from pandas.core.arrays import Categorical, ExtensionArray
 import pandas.core.algorithms as algorithms
 from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
                            OrderedDict, raise_with_traceback)
@@ -511,7 +512,7 @@ def _get_axes(N, K, index=index, columns=columns):
             index, columns = _get_axes(len(values), 1)
             return _arrays_to_mgr([values], columns, index, columns,
                                   dtype=dtype)
-        elif is_datetimetz(values):
+        elif (is_datetimetz(values) or is_extension_array_dtype(values)):
Member:

What happens if the dtype is an extension dtype? (like the check above for categorical)
But certainly fine with not supporting it for now.

Similar for Series, we need to either define what pd.Series(values, dtype=SomeExtensionDtype()) does, or either error on it.

Contributor Author:

In [7]: pd.Series([0, 1, 2], dtype=cyberpandas.IPType())
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-7-7fc01964da1d> in <module>()
----> 1 pd.Series([0, 1, 2], dtype=cyberpandas.IPType())

~/sandbox/pandas-ip/pandas/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
    246             else:
    247                 data = _sanitize_array(data, index, dtype, copy,
--> 248                                        raise_cast_failure=True)
    249
    250                 data = SingleBlockManager(data, index, fastpath=True)

~/sandbox/pandas-ip/pandas/pandas/core/series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
   3214         if dtype is not None:
   3215             try:
-> 3216                 subarr = _try_cast(data, False)
   3217             except Exception:
   3218                 if raise_cast_failure:  # pragma: no cover

~/sandbox/pandas-ip/pandas/pandas/core/series.py in _try_cast(arr, take_fast_path)
   3168             subarr = maybe_cast_to_datetime(arr, dtype)
   3169             if not is_extension_type(subarr):
-> 3170                 subarr = np.array(subarr, dtype=dtype, copy=copy)
   3171         except (ValueError, TypeError):
   3172             if is_categorical_dtype(dtype):

TypeError: data type not understood

How should we handle this? I'm fine with raising with a better error message. We don't know how to cast from (arbitrary array, extension_dtype) -> extension array.

Although... maybe we could support Series(extension_array, dtype=extension_dtype).

Oh hey, that "works"

In [10]: pd.Series(cyberpandas.IPArray([0, 1, 2]), dtype=cyberpandas.IPType())
Out[10]:
0    0.0.0.0
1    0.0.0.1
2    0.0.0.2
dtype: ip

But it only works since we ignore the dtype entirely :)

I will raise if data.dtype is not dtype-equal to the passed dtype, I think.
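The agreeing-dtype case can be sketched with Categorical, the one extension type already in pandas at this point:

```python
import pandas as pd

# Constructing a Series from an extension array with a matching explicit
# dtype: the dtypes agree, so no cast is needed.
cat = pd.Categorical(["a", "b", "a"])
s = pd.Series(cat, dtype=cat.dtype)
print(s.dtype)
```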

Member:

also for things like astype(SomeExtensionDtype) we will need an informative error message

Contributor:

isn't a datetimetz an extension type now? Why can't you remove the first part of the clause?

Contributor Author:

Nope, not yet. Just Categorical so far. I have Interval and Period started, but no PRs yet.

             # GH19157
             if columns is None:
                 columns = [0]
@@ -2819,15 +2820,15 @@ def reindexer(value):
             # now align rows
             value = reindexer(value).T
 
-        elif isinstance(value, Categorical):
+        elif isinstance(value, ExtensionArray):
             value = value.copy()
 
         elif isinstance(value, Index) or is_sequence(value):
             from pandas.core.series import _sanitize_index
 
             # turn me into an ndarray
             value = _sanitize_index(value, self.index, copy=False)
-            if not isinstance(value, (np.ndarray, Index)):
+            if not isinstance(value, (np.ndarray, Index, ExtensionArray)):
Contributor:

you can use is_array_like here (and maybe in other places)

Contributor:

?

Contributor Author:

I'll just remove this change until we support array-backed indexes.

                 if isinstance(value, list) and len(value) > 0:
                     value = maybe_convert_platform(value)
                 else:
@@ -2849,7 +2850,7 @@ def reindexer(value):
             value = maybe_cast_to_datetime(value, value.dtype)
 
         # return internal types directly
-        if is_extension_type(value):
+        if is_extension_type(value) or is_extension_array_dtype(value):
Contributor Author:

This is an unfortunate intermediate stage where some of our internal extension types (sparse, datetime w/ tz) are not yet actual extension array types. We'll be able to clean this up eventually.
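In later pandas releases both categorical and tz-aware datetime data became genuine extension dtypes, so a single check suffices; a quick illustration:

```python
import pandas as pd
from pandas.api.types import is_extension_array_dtype

# Both of these now report as extension dtypes, which is why the twin
# checks eventually collapsed into one.
cat_check = is_extension_array_dtype(pd.Series(pd.Categorical(["a"])))
tz_check = is_extension_array_dtype(
    pd.Series(pd.date_range("2018-01-01", periods=1, tz="UTC")))
print(cat_check, tz_check)
```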

             return value
 
         # broadcast across multiple columns if necessary
@@ -3386,12 +3387,8 @@ class max type
         new_obj = self.copy()
 
         def _maybe_casted_values(index, labels=None):
-            if isinstance(index, PeriodIndex):
-                values = index.astype(object).values
-            elif isinstance(index, DatetimeIndex) and index.tz is not None:
-                values = index
-            else:
-                values = index.values
+            values = index._values
Member:

Can you put a small comment here what the intent of the function is?

Contributor Author:

hmm I'm not really sure :) It also seems to just be used in one place, so this seems like an opportunity for cleanup.

This change is possible because PeriodIndex.values / _values is just .astype(object).values, and DatetimeIndex._values is self with a TZ.
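The tz case is easy to verify in current pandas, where `.values` drops the timezone while `._values` keeps it:

```python
import pandas as pd

dti = pd.date_range("2018-01-01", periods=2, tz="US/Eastern")
# .values is a timezone-naive UTC ndarray; ._values preserves the tz.
naive_dtype = dti.values.dtype
aware_dtype = dti._values.dtype
print(naive_dtype, aware_dtype)
```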

+            if not isinstance(index, (PeriodIndex, DatetimeIndex)):
Contributor:

what are you trying to do here?

Contributor Author:

This is identical to what happened before in the else branch. Seems like it's doing some type inference / coercion.

                 if values.dtype == np.object_:
                     values = lib.maybe_convert_objects(values)
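The inference step amounts to the following (note `lib.maybe_convert_objects` is a private pandas API, shown here only for illustration):

```python
import numpy as np
from pandas._libs import lib

# An object array of plain ints is inferred back to a proper int64 array.
vals = np.array([1, 2, 3], dtype=object)
converted = lib.maybe_convert_objects(vals)
print(converted.dtype)
```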

@@ -5640,7 +5637,7 @@ def count(self, axis=0, level=None, numeric_only=False):
         if len(frame._get_axis(axis)) == 0:
             result = Series(0, index=frame._get_agg_axis(axis))
         else:
-            if frame._is_mixed_type:
Contributor:

This whole count thing should be cleaned up anyhow; can you add an issue about this? The optimization is not worth it and makes the code more complex (not about your changes, but the mixed-type stuff).

+            if frame._is_mixed_type or frame._data.any_extension_types:
                 result = notna(frame).sum(axis=axis)
             else:
                 counts = notna(frame.values).sum(axis=axis)
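The fallback path simply counts non-missing cells column-wise, which works for extension-backed columns too:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0],
                   "b": pd.Categorical(["x", None, "x"])})
# notna works element-wise on both the float and the categorical column.
counts = pd.notna(df).sum(axis=0)
print(counts.tolist())
```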
4 changes: 3 additions & 1 deletion pandas/core/indexes/base.py
@@ -13,6 +13,7 @@
 from pandas import compat
 
 from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import ExtensionArray
 from pandas.core.dtypes.generic import (
     ABCSeries, ABCDataFrame,
     ABCMultiIndex,
@@ -2002,6 +2003,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
 
         if is_categorical_dtype(values.dtype):
             values = np.array(values)
+
         elif is_object_dtype(values.dtype):
             values = lib.maybe_convert_objects(values, safe=1)
 
@@ -2601,7 +2603,7 @@ def get_value(self, series, key):
         # if we have something that is Index-like, then
         # use this, e.g. DatetimeIndex
         s = getattr(series, '_values', None)
-        if isinstance(s, Index) and is_scalar(key):
+        if isinstance(s, (ExtensionArray, Index)) and is_scalar(key):
Contributor:

is_array_like might work here

         try:
             return s[key]
         except (IndexError, ValueError):
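What this change enables, seen from the user side: scalar lookup on a Series backed by an extension array goes straight through to the array:

```python
import pandas as pd

s = pd.Series(pd.Categorical(["a", "b", "c"]))
# Scalar label lookup dispatches into the backing Categorical.
val = s[1]
print(val)
```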