ENH: Extending Pandas with custom types #19174

Closed
1 change: 1 addition & 0 deletions .gitignore
@@ -108,3 +108,4 @@ doc/tmp.sv
doc/source/styled.xlsx
doc/source/templates/
env/
.mypy_cache
61 changes: 61 additions & 0 deletions doc/source/developer.rst
@@ -140,3 +140,64 @@ As an example of fully-formed metadata:
'metadata': None}
],
'pandas_version': '0.20.0'}

.. _developer.custom-array-types:

Custom Array Types
------------------

.. versionadded:: 0.23.0

.. warning::
Support for custom array types is experimental.

Sometimes the NumPy type system isn't rich enough for your needs. Pandas has
made a few extensions internally (e.g. ``Categorical``). While this has worked
well for pandas, not all custom data types belong in pandas itself.

Pandas defines an interface for custom arrays. Arrays implementing this
interface will be stored correctly in ``Series`` or ``DataFrame``. The ABCs
that must be implemented are:

1. :class:`ExtensionDtype`: A class describing your data type itself. This is
   similar to a ``numpy.dtype``.
2. :class:`ExtensionArray`: A container for your data.

Throughout this document, we'll use the example of storing IPv6 addresses. An
IPv6 address is 128 bits, so NumPy doesn't have a native data type for it. We'll
model it as a structured array with two ``uint64`` fields, which together
represent the 128-bit integer that is the IP address.
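
Concretely, a 128-bit address splits into a high and a low ``uint64`` half. The
snippet below is a small illustration of that conversion (``ipv6_dtype`` and
``to_record`` are illustrative names, not part of the interface):

.. code-block:: python

    import ipaddress

    import numpy as np

    ipv6_dtype = np.dtype([('hi', '>u8'), ('lo', '>u8')])

    def to_record(address):
        # split the 128-bit integer into its two 64-bit halves
        value = int(ipaddress.IPv6Address(address))
        return (value >> 64, value & (2 ** 64 - 1))

    arr = np.array([to_record('2001:db8::1')], dtype=ipv6_dtype)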

Extension Dtype
'''''''''''''''

This class should describe your data type. The most important fields are
``name`` and ``base``:
Member: you mentioned you removed base from the interface


.. code-block:: python

class IPv6Type(ExtensionDtype):
name = 'IPv6'
base = np.dtype([('hi', '>u8'), ('lo', '>u8')])
type = IPTypeType
kind = 'O'
fill_value = np.array([(0, 0)], dtype=base)
Member: Clarify that these fields can be properties, e.g., based on parameters set in the constructor.
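
As an illustration of that suggestion, a hypothetical parametrized dtype could
compute these fields from constructor arguments instead of class attributes
(``ParametrizedIPType`` is made up for this sketch; ``np`` and ``ExtensionDtype``
are assumed to be imported as above):

.. code-block:: python

    class ParametrizedIPType(ExtensionDtype):
        """Hypothetical dtype whose fields are derived from a parameter."""

        def __init__(self, byteorder='>'):
            self.byteorder = byteorder

        @property
        def name(self):
            return 'IPv6[{}]'.format(self.byteorder)

        @property
        def base(self):
            return np.dtype([('hi', self.byteorder + 'u8'),
                             ('lo', self.byteorder + 'u8')])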


``base`` describes the underlying storage of individual items in your array.
TODO: is this true? Or does ``.base`` refer to the original memory this
is a view on? Different meanings for ``np.dtype.base`` vs. ``np.ndarray.base``?

In our IPAddress case, we're using a NumPy structured array with two fields.

Extension Array
'''''''''''''''

This is the actual array container for your data, similar to a ``Categorical``,
and requires the most work to implement correctly. *pandas makes no assumptions
about how you store the data*. You're free to use NumPy arrays or PyArrow
arrays, or even just Python lists. That said, several of the methods required by
the interface expect NumPy arrays as the return value. The required attributes
and methods are listed below, followed by a brief sketch.

* ``dtype``: Should be an *instance* of your custom ``ExtensionDtype``
* ``formatting_values(self)``: Used for printing Series and DataFrame
* ``concat_same_type(concat)``: Used in :func:`pd.concat`
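
A minimal container for the IPv6 example might look like the following sketch.
It assumes the method names listed above (the interface is still experimental),
reuses ``IPv6Type`` and ``np`` from earlier, and stores the records in a plain
NumPy structured array; other storage choices are equally valid.

.. code-block:: python

    class IPv6Array(ExtensionArray):
        """Sketch of an extension array backed by a structured ndarray."""

        def __init__(self, values):
            self.data = np.asarray(values, dtype=IPv6Type.base)

        @property
        def dtype(self):
            # an *instance* of the custom dtype
            return IPv6Type()

        def formatting_values(self):
            # values used when printing a Series or DataFrame
            return np.array(['{:x}:{:x}'.format(int(hi), int(lo))
                             for hi, lo in self.data], dtype=object)

        @classmethod
        def concat_same_type(cls, concat):
            # used by pd.concat when all pieces share this type
            return cls(np.concatenate([array.data for array in concat]))
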
1 change: 1 addition & 0 deletions pandas/api/__init__.py
@@ -1 +1,2 @@
""" public toolkit API """
from . import types, extensions # noqa
4 changes: 4 additions & 0 deletions pandas/api/extensions.py
@@ -0,0 +1,4 @@
from pandas.core.extensions import ( # noqa
ExtensionArray,
ExtensionDtype,
)
14 changes: 10 additions & 4 deletions pandas/core/algorithms.py
@@ -6,6 +6,7 @@
from warnings import warn, catch_warnings
import numpy as np

from pandas.core.extensions import ExtensionArray
from pandas.core.dtypes.cast import (
maybe_promote, construct_1d_object_array_from_listlike)
from pandas.core.dtypes.generic import (
@@ -22,7 +23,7 @@
is_categorical, is_datetimetz,
is_datetime64_any_dtype, is_datetime64tz_dtype,
is_timedelta64_dtype, is_interval_dtype,
is_scalar, is_list_like,
is_scalar, is_list_like, is_extension_type,
_ensure_platform_int, _ensure_object,
_ensure_float64, _ensure_uint64,
_ensure_int64)
@@ -542,9 +543,12 @@ def value_counts(values, sort=True, ascending=False, normalize=False,

else:

if is_categorical_dtype(values) or is_sparse(values):

# handle Categorical and sparse,
if (is_extension_type(values) and not
is_datetime64tz_dtype(values)):
# Need the not is_datetime64tz_dtype since it's actually
Contributor: huh? this is not friendly

# an ndarray. It doesn't have a `.values.value_counts`.
# Perhaps we need a new is_extension_type method that
# distinguishes these...
result = Series(values).values.value_counts(dropna=dropna)
result.name = name
counts = result.values
@@ -1323,6 +1327,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
elif is_interval_dtype(arr):
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
elif isinstance(arr, ExtensionArray):
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

if indexer is None:
indexer = np.arange(arr.shape[axis], dtype=np.int64)
5 changes: 3 additions & 2 deletions pandas/core/base.py
@@ -880,7 +880,7 @@ def _map_values(self, mapper, na_action=None):
if isinstance(mapper, ABCSeries):
# Since values were input this means we came from either
# a dict or a series and mapper should be an index
if is_extension_type(self.dtype):
if is_extension_type(self):
values = self._values
else:
values = self.values
@@ -891,7 +891,8 @@ return new_values
return new_values

# we must convert to python types
if is_extension_type(self.dtype):
Contributor: it needs to be iterable

# TODO: is map part of the interface?
if is_extension_type(self) and hasattr(self._values, 'map'):
values = self._values
if na_action is not None:
raise NotImplementedError
15 changes: 15 additions & 0 deletions pandas/core/categorical.py
@@ -43,6 +43,7 @@
from pandas.io.formats.terminal import get_terminal_size
from pandas.util._validators import validate_bool_kwarg
from pandas.core.config import get_option
from pandas.core.extensions import ExtensionArray


def _cat_compare_op(op):
@@ -409,6 +410,11 @@ def dtype(self):
"""The :class:`~pandas.api.types.CategoricalDtype` for this instance"""
return self._dtype

@property
def _block_type(self):
Contributor Author: I'm still thinking about how best to handle this. The conflict is:

  1. We don't want 3rd party libs to worry about Blocks at all
  2. We (maybe) want Categorical, etc. to be instances of ExtensionArray
  3. We do want to use CategoricalBlock instead of the default ExtensionBlock

Member: Why do we want to use CategoricalBlock instead of ExtensionBlock with categorical dtype? Because that would require more changes in the code?
As in theory/ideally, the categorical custom operations should all be defined in the array (Categorical) and the Block should not have to worry about the type?

Contributor Author: Yeah, I'm looking through the methods CategoricalBlock implements that aren't part of the interface:

  1. fillna: should probably be added
  2. interpolate: Actual interpolation probably isn't important enough, but I think this is called indirectly for fillna?
  3. shift
  4. to_native_types

Trying to balance complexity of implementation for 3rd-parties here.

from pandas.core.internals import CategoricalBlock
return CategoricalBlock

@property
def _constructor(self):
return Categorical
@@ -2131,6 +2137,15 @@ def repeat(self, repeats, *args, **kwargs):
return self._constructor(values=codes, categories=self.categories,
ordered=self.ordered, fastpath=True)


# TODO: Categorical does not currently implement
# - concat_same_type
# - can_hold_na
# We don't need to implement these, since they're just for
# Block things, and we only use CategoricalBlocks for categoricals.
# We could move that logic from CategoricalBlock to Categorical,
# but holding off for now.
ExtensionArray.register(Categorical)
# The Series.cat accessor


4 changes: 3 additions & 1 deletion pandas/core/dtypes/cast.py
@@ -1089,6 +1089,7 @@ def find_common_type(types):
numpy.find_common_type

"""
# TODO: Make part of the interface?

if len(types) == 0:
raise ValueError('no types given')
@@ -1100,7 +1101,8 @@
if all(is_dtype_equal(first, t) for t in types[1:]):
return first

if any(isinstance(t, ExtensionDtype) for t in types):
# TODO: Period is an ExtensionDtype
if any(isinstance(t, (ExtensionDtype, PeriodDtype)) for t in types):
return np.object

# take lowest unit
38 changes: 22 additions & 16 deletions pandas/core/dtypes/common.py
@@ -568,7 +568,6 @@ def is_string_dtype(arr_or_dtype):
"""

# TODO: gh-15585: consider making the checks stricter.

if arr_or_dtype is None:
return False
try:
@@ -1624,11 +1623,13 @@ def is_bool_dtype(arr_or_dtype):

def is_extension_type(arr):
"""
Check whether an array-like is of a pandas extension class instance.
Check whether an array-like is a pandas extension class instance.

Extension classes include categoricals, pandas sparse objects (i.e.
classes represented within the pandas library and not ones external
to it like scipy sparse matrices), and datetime-like arrays.
to it like scipy sparse matrices), and datetime-like arrays with
timezones, or any third-party objects satisfying the pandas array
interface.

Parameters
----------
@@ -1646,39 +1647,44 @@ def is_extension_type(arr):
False
>>> is_extension_type(np.array([1, 2, 3]))
False
>>>

Categoricals
>>> cat = pd.Categorical([1, 2, 3])
>>>
>>> is_extension_type(cat)
True
>>> is_extension_type(pd.Series(cat))
True

pandas' Sparse arrays
>>> is_extension_type(pd.SparseArray([1, 2, 3]))
True
>>> is_extension_type(pd.SparseSeries([1, 2, 3]))
True
>>>
>>> from scipy.sparse import bsr_matrix
>>> is_extension_type(bsr_matrix([1, 2, 3]))
False
>>> is_extension_type(pd.DatetimeIndex([1, 2, 3]))
False

pandas' datetime with timezone
>>> is_extension_type(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
True
>>>
>>> dtype = DatetimeTZDtype("ns", tz="US/Eastern")
>>> s = pd.Series([], dtype=dtype)
>>> is_extension_type(s)
True
"""

if is_categorical(arr):
return True
elif is_sparse(arr):
return True
elif is_datetimetz(arr):
return True
return False
# XXX: we have many places where we call this with a `.dtype`,
# instead of a type. Think about supporting that too...
from pandas.core.extensions import ExtensionArray, ExtensionDtype
return (isinstance(arr, ExtensionArray) or
Contributor: this needs just to satisfy is_array_like (which is an iterable with a dtype)

isinstance(getattr(arr, 'values', None), ExtensionArray) or
# XXX: I don't like this getattr('dtype'), but I think it's
# necessary since DatetimeIndex().values of a datetime w/ tz
# is just a regular numpy array, and not an instance of
# ExtensionArray. I think that's since
# datetime (without tz) is *not* an extension type, but
# datetime[tz] *is* an extension type.
isinstance(getattr(arr, 'dtype', None), ExtensionDtype))


def is_complex_dtype(arr_or_dtype):
47 changes: 9 additions & 38 deletions pandas/core/dtypes/dtypes.py
@@ -4,27 +4,11 @@
import numpy as np
from pandas import compat
from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex
from pandas.core.extensions import ExtensionDtype


class ExtensionDtype(object):
"""
A np.dtype duck-typed class, suitable for holding a custom dtype.

THIS IS NOT A REAL NUMPY DTYPE
"""
name = None
names = None
type = None
subdtype = None
kind = None
str = None
num = 100
shape = tuple()
itemsize = 8
base = None
isbuiltin = 0
isnative = 0
_metadata = []
class PandasExtensionMixin(object):
"""Useful stuff that isn't in the interface"""
_cache = {}

def __unicode__(self):
@@ -62,17 +46,6 @@ def __repr__(self):
"""
return str(self)

def __hash__(self):
raise NotImplementedError("sub-classes should implement an __hash__ "
"method")

def __eq__(self, other):
raise NotImplementedError("sub-classes should implement an __eq__ "
"method")

def __ne__(self, other):
return not self.__eq__(other)

def __getstate__(self):
# pickle support; we don't want to pickle the cache
return {k: getattr(self, k, None) for k in self._metadata}
@@ -84,9 +57,6 @@ def reset_cache(cls):

@classmethod
def is_dtype(cls, dtype):
""" Return a boolean if the passed type is an actual dtype that
we can match (via string or type)
"""
if hasattr(dtype, 'dtype'):
dtype = dtype.dtype
if isinstance(dtype, np.dtype):
Expand All @@ -97,7 +67,7 @@ def is_dtype(cls, dtype):
return True
try:
return cls.construct_from_string(dtype) is not None
except:
except TypeError:
return False


@@ -108,7 +78,7 @@ class CategoricalDtypeType(type):
pass


class CategoricalDtype(ExtensionDtype):
class CategoricalDtype(PandasExtensionMixin, ExtensionDtype):
"""
Type for categorical data with the categories and orderedness

@@ -387,7 +357,7 @@ class DatetimeTZDtypeType(type):
pass


class DatetimeTZDtype(ExtensionDtype):
class DatetimeTZDtype(PandasExtensionMixin, ExtensionDtype):

"""
A np.dtype duck-typed class, suitable for holding a custom datetime with tz
@@ -501,7 +471,7 @@ class PeriodDtypeType(type):
pass


class PeriodDtype(ExtensionDtype):
class PeriodDtype(PandasExtensionMixin):
__metaclass__ = PeriodDtypeType
"""
A Period duck-typed class, suitable for holding a period with freq dtype.
@@ -516,6 +486,7 @@ class PeriodDtype(ExtensionDtype):
_metadata = ['freq']
_match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
_cache = {}
names = None # TODO inherit and remove

def __new__(cls, freq=None):
"""
@@ -619,7 +590,7 @@ class IntervalDtypeType(type):
pass


class IntervalDtype(ExtensionDtype):
class IntervalDtype(PandasExtensionMixin, ExtensionDtype):
__metaclass__ = IntervalDtypeType
"""
A Interval duck-typed class, suitable for holding an interval