pandas-dev · TomAugspurger · Feb 23, 2018 · Feb 2, 2018 · Feb 3, 2018 · Feb 3, 2018
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1,4 +1,6 @@
 """An interface for extending pandas with custom arrays."""
+import numpy as np
+
 from pandas.errors import AbstractMethodError
 
 _not_implemented_message = "{} does not implement {}."
@@ -24,7 +26,6 @@ class ExtensionArray(object):
     * take
     * copy
     * _formatting_values
-    * _concat_same_type
 
     Some additional methods are required to satisfy pandas' internal, private
     block API.
@@ -51,9 +52,6 @@ class ExtensionArray(object):
     Extension arrays should be able to be constructed with instances of
     the class, i.e. ``ExtensionArray(extension_array)`` should return
     an instance, not error.
-
-    Additionally, certain methods and interfaces are required for proper
-    this array to be properly stored inside a ``DataFrame`` or ``Series``.
     """
     # ------------------------------------------------------------------------
     # Must be a Sequence
@@ -177,9 +175,9 @@ def take(self, indexer, allow_fill=True, fill_value=None):
 
         Examples
         --------
-        Suppose the extension array somehow backed by a NumPy structured array
-        and that the underlying structured array is stored as ``self.data``.
-        Then ``take`` may be written as
+        Suppose the extension array is somehow backed by a NumPy array and
+        that the underlying array is stored as ``self.data``. Then ``take``
+        may be written as
 
         .. code-block:: python
 
@@ -219,7 +217,7 @@ def _formatting_values(self):
         # type: () -> np.ndarray
         # At the moment, this has to be an array since we use result.dtype
         """An array of values to be printed in, e.g. the Series repr"""
-        raise AbstractMethodError(self)
+        return np.array(self)
 
     @classmethod
     def _concat_same_type(cls, to_concat):

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -13,6 +13,7 @@
                      is_complex_dtype, is_categorical_dtype,
                      is_string_like_dtype, is_bool_dtype,
                      is_integer_dtype, is_dtype_equal,
+                     is_extension_array_dtype,
                      needs_i8_conversion, _ensure_object,
                      pandas_dtype,
                      is_scalar,
@@ -52,12 +53,15 @@ def isna(obj):
 
 
 def _isna_new(obj):
+    from ..arrays import ExtensionArray
+
     if is_scalar(obj):
         return libmissing.checknull(obj)
     # hack (for now) because MI registers as ndarray
     elif isinstance(obj, ABCMultiIndex):
         raise NotImplementedError("isna is not defined for MultiIndex")
-    elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
+    elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
+                          ExtensionArray)):
         return _isna_ndarraylike(obj)
     elif isinstance(obj, ABCGeneric):
         return obj._constructor(obj._data.isna(func=isna))
@@ -124,11 +128,14 @@ def _use_inf_as_na(key):
 
 
 def _isna_ndarraylike(obj):
+    from ..arrays import ExtensionArray
 
     values = getattr(obj, 'values', obj)
     dtype = values.dtype
 
-    if is_string_dtype(dtype):
+    if isinstance(values, ExtensionArray):
+        result = values.isna()
+    elif is_string_dtype(dtype):
         if is_categorical_dtype(values):
             from pandas import Categorical
             if not isinstance(values, Categorical):
@@ -406,4 +413,7 @@ def remove_na_arraylike(arr):
     """
     Return array-like containing only true/non-NaN values, possibly empty.
     """
-    return arr[notna(lib.values_from_object(arr))]
+    if is_extension_array_dtype(arr):
+        return arr[notna(arr)]
+    else:
+        return arr[notna(lib.values_from_object(arr))]
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -39,6 +39,7 @@
     is_categorical_dtype,
     is_object_dtype,
     is_extension_type,
+    is_extension_array_dtype,
     is_datetimetz,
     is_datetime64_any_dtype,
     is_datetime64tz_dtype,
@@ -71,7 +72,7 @@
                                    create_block_manager_from_arrays,
                                    create_block_manager_from_blocks)
 from pandas.core.series import Series
-from pandas.core.arrays import Categorical
+from pandas.core.arrays import Categorical, ExtensionArray
 import pandas.core.algorithms as algorithms
 from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
                            OrderedDict, raise_with_traceback)
@@ -511,7 +512,7 @@ def _get_axes(N, K, index=index, columns=columns):
             index, columns = _get_axes(len(values), 1)
             return _arrays_to_mgr([values], columns, index, columns,
                                   dtype=dtype)
-        elif is_datetimetz(values):
+        elif (is_datetimetz(values) or is_extension_array_dtype(values)):
             # GH19157
             if columns is None:
                 columns = [0]
@@ -2796,15 +2797,15 @@ def reindexer(value):
             # now align rows
             value = reindexer(value).T
 
-        elif isinstance(value, Categorical):
+        elif isinstance(value, ExtensionArray):
             value = value.copy()
 
         elif isinstance(value, Index) or is_sequence(value):
             from pandas.core.series import _sanitize_index
 
             # turn me into an ndarray
             value = _sanitize_index(value, self.index, copy=False)
-            if not isinstance(value, (np.ndarray, Index)):
+            if not isinstance(value, (np.ndarray, Index, ExtensionArray)):
                 if isinstance(value, list) and len(value) > 0:
                     value = maybe_convert_platform(value)
                 else:
@@ -2826,7 +2827,7 @@ def reindexer(value):
             value = maybe_cast_to_datetime(value, value.dtype)
 
         # return internal types directly
-        if is_extension_type(value):
+        if is_extension_type(value) or is_extension_array_dtype(value):
             return value
 
         # broadcast across multiple columns if necessary
@@ -3355,12 +3356,9 @@ class    max    type
             new_obj = self.copy()
 
         def _maybe_casted_values(index, labels=None):
-            if isinstance(index, PeriodIndex):
-                values = index.astype(object).values
-            elif isinstance(index, DatetimeIndex) and index.tz is not None:
-                values = index
-            else:
-                values = index.values
+            values = index._as_best_array()
+            # TODO: Check if necessary...
+            if not isinstance(index, (PeriodIndex, DatetimeIndex)):
                 if values.dtype == np.object_:
                     values = lib.maybe_convert_objects(values)
 

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -13,6 +13,7 @@
 from pandas import compat
 
 from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import ExtensionArray
 from pandas.core.dtypes.generic import (
     ABCSeries, ABCDataFrame,
     ABCMultiIndex,
@@ -1038,6 +1039,31 @@ def _to_embed(self, keep_tz=False, dtype=None):
 
         return self.values.copy()
 
+    def _as_best_array(self):
+        # type: () -> Union[ExtensionArray, ndarray]
+        """Return the underlying values as the best array type.
+
+        Indexes backed by ExtensionArrays will return the ExtensionArray.
+        Otherwise, an ndarray is returned.
+
+        Examples
+        --------
+        >>> pd.Index([0, 1, 2])._as_best_array()
+        array([0, 1, 2])
+
+        >>> pd.CategoricalIndex(['a', 'a', 'b'])._as_best_array()
+        [a, a, b]
+        Categories (2, object): [a, b]
+
+        >>> pd.IntervalIndex.from_breaks([0, 1, 2])._as_best_array()
+        IntervalArray([(0, 1], (1, 2]])
+        """
+        # We need this since CategoricalIndex.values -> Categorical
+        #                but IntervalIndex.values    -> ndarray[object]
+        # TODO: IntervalIndex defines _array_values. Would be nice to
+        # have an unambiguous way of getting an ndarray (or just use asarray?)
+        return self.values
+
     _index_shared_docs['astype'] = """
         Create an Index with values cast to dtypes. The class of a new Index
         is determined by dtype. When conversion is impossible, a ValueError
@@ -1946,6 +1972,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
 
         if is_categorical_dtype(values.dtype):
             values = np.array(values)
+
+        elif isinstance(values, ExtensionArray):
+            # This is still un-exercised within pandas, since all our
+            # extension dtypes have custom indexes.
+            values = values._formatting_values()
+
         elif is_object_dtype(values.dtype):
             values = lib.maybe_convert_objects(values, safe=1)
 
@@ -2525,7 +2557,7 @@ def get_value(self, series, key):
         # if we have something that is Index-like, then
         # use this, e.g. DatetimeIndex
         s = getattr(series, '_values', None)
-        if isinstance(s, Index) and is_scalar(key):
+        if isinstance(s, (ExtensionArray, Index)) and is_scalar(key):
             try:
                 return s[key]
             except (IndexError, ValueError):

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -297,6 +297,9 @@ def get_values(self):
         """ return the underlying data as an ndarray """
         return self._data.get_values()
 
+    def _as_best_array(self):
+        return self._data
+
     def tolist(self):
         return self._data.tolist()
 

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -1034,6 +1034,13 @@ def _to_embed(self, keep_tz=False, dtype=None):
 
         return self.values.copy()
 
+    def _as_best_array(self):
+        # tz-naive -> the plain ndarray from ``self.values``
+        # tz-aware -> the DatetimeIndex itself (for now)
+        if self.tz is not None:
+            return self
+        return self.values
+
     def to_pydatetime(self):
         """
         Return DatetimeIndex as object ndarray of datetime.datetime objects

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -56,7 +56,10 @@
     is_null_datelike_scalar)
 import pandas.core.dtypes.concat as _concat
 
-from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex
+from pandas.core.dtypes.generic import (
+    ABCSeries,
+    ABCDatetimeIndex,
+    ABCIndexClass)
 import pandas.core.common as com
 import pandas.core.algorithms as algos
 
@@ -1854,6 +1857,20 @@ class ExtensionBlock(NonConsolidatableMixIn, Block):
 
     ExtensionArrays are limited to 1-D.
     """
+
+    def __init__(self, values, placement, ndim=None):
+        values = self._maybe_coerce_values(values)  # unbox Series / Index
+        super(ExtensionBlock, self).__init__(values, placement, ndim)
+
+    def _maybe_coerce_values(self, values):
+        # Unbox a Series / Index down to the array it wraps.
+        # The underlying dtype is left unchanged.
+        if isinstance(values, ABCSeries):
+            values = values.values
+        elif isinstance(values, ABCIndexClass):
+            values = values._as_best_array()
+        return values
+
     @property
     def _holder(self):
         # For extension blocks, the holder is values-dependent.
@@ -4101,7 +4118,8 @@ def set(self, item, value, check=False):
         # FIXME: refactor, clearly separate broadcasting & zip-like assignment
         #        can prob also fix the various if tests for sparse/categorical
 
-        value_is_extension_type = is_extension_type(value)
+        value_is_extension_type = (is_extension_type(value) or
+                                   is_extension_array_dtype(value))
 
         # categorical/spares/datetimetz
         if value_is_extension_type:
@@ -4834,13 +4852,10 @@ def form_blocks(arrays, names, axes):
     if len(items_dict['ExtensionBlock']):
 
         external_blocks = []
+
         for i, _, array in items_dict['ExtensionBlock']:
-            if isinstance(array, ABCSeries):
-                array = array.values
-            # Allow our internal arrays to chose their block type.
-            block_type = getattr(array, '_block_type', ExtensionBlock)
             external_blocks.append(
-                make_block(array, klass=block_type,
+                make_block(array, klass=ExtensionBlock,
                            fastpath=True, placement=[i]))
         blocks.extend(external_blocks)
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -14,6 +14,7 @@
 import numpy.ma as ma
 
 from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import ExtensionArray
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
     is_bool,
@@ -173,12 +174,15 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                 raise NotImplementedError("initializing a Series from a "
                                           "MultiIndex is not supported")
             elif isinstance(data, Index):
-                # need to copy to avoid aliasing issues
                 if name is None:
                     name = data.name
 
-                data = data._to_embed(keep_tz=True, dtype=dtype)
-                copy = False
+                if dtype is not None:
+                    data = data.astype(dtype)
+
+                # need to copy to avoid aliasing issues
+                data = data._as_best_array().copy()
+
             elif isinstance(data, np.ndarray):
                 pass
             elif isinstance(data, Series):
@@ -234,6 +238,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                                        copy=copy)
                 elif copy:
                     data = data.copy()
+            elif isinstance(data, ExtensionArray):
+                if copy:
+                    data = data.copy()
+                data = SingleBlockManager(data, index, fastpath=True)
             else:
                 data = _sanitize_array(data, index, dtype, copy,
                                        raise_cast_failure=True)
@@ -2570,7 +2578,11 @@ def _reindex_indexer(self, new_index, indexer, copy):
             return self
 
         # be subclass-friendly
-        new_values = algorithms.take_1d(self.get_values(), indexer)
+        if isinstance(self.values, ExtensionArray):
+            new_values = self.values.take(indexer)
+        else:
+            new_values = algorithms.take_1d(self.get_values(), indexer)
+
         return self._constructor(new_values, index=new_index)
 
     def _needs_reindex_multi(self, axes, method, level):
@@ -3117,11 +3129,8 @@ def _sanitize_index(data, index, copy=False):
         raise ValueError('Length of values does not match length of ' 'index')
 
     if isinstance(data, ABCIndexClass) and not copy:
-        pass
-    elif isinstance(data, PeriodIndex):
-        data = data.astype(object).values
-    elif isinstance(data, DatetimeIndex):
-        data = data._to_embed(keep_tz=True)
+        data = data._as_best_array()
+
     elif isinstance(data, np.ndarray):
 
         # coerce datetimelike types
@@ -3194,11 +3203,12 @@ def _try_cast(arr, take_fast_path):
             # we will try to copy be-definition here
             subarr = _try_cast(data, True)
 
-    elif isinstance(data, Categorical):
+    elif isinstance(data, ExtensionArray):
         subarr = data
 
         if copy:
             subarr = data.copy()
+        # XXX: This is the only early return. See if it can be avoided.
         return subarr
 
     elif isinstance(data, (list, tuple)) and len(data) > 0:
@@ -3221,6 +3231,7 @@ def _try_cast(arr, take_fast_path):
         start, stop, step = get_range_parameters(data)
         arr = np.arange(start, stop, step, dtype='int64')
         subarr = _try_cast(arr, False)
+
     else:
         subarr = _try_cast(data, False)
 

diff --git a/pandas/tests/extension_arrays/__init__.py b/pandas/tests/extension_arrays/__init__.py