ExtensionArray.take default implementation (#20814)

Implements a take interface that's compatible with NumPy and optionally pandas' NA semantics. Closes #20640
pandas-dev · Apr 27, 2018 · 2cbdd9a · 2cbdd9a
1 parent 96f2f57
commit 2cbdd9a
Show file tree

Hide file tree

Showing 19 changed files with 460 additions and 81 deletions.
diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
@@ -2,5 +2,6 @@
 from pandas.core.accessor import (register_dataframe_accessor,  # noqa
                                   register_index_accessor,
                                   register_series_accessor)
+from pandas.core.algorithms import take  # noqa
 from pandas.core.arrays.base import ExtensionArray  # noqa
 from pandas.core.dtypes.dtypes import ExtensionDtype  # noqa
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1448,6 +1448,94 @@ def func(arr, indexer, out, fill_value=np.nan):
     return func
 
 
+def take(arr, indices, allow_fill=False, fill_value=None):
+    """
+    Take elements from an array.
+
+    .. versionadded:: 0.23.0
+
+    Parameters
+    ----------
+    arr : sequence
+        Non array-likes (sequences without a dtype) are coereced
+        to an ndarray.
+    indices : sequence of integers
+        Indices to be taken.
+    allow_fill : bool, default False
+        How to handle negative values in `indices`.
+
+        * False: negative values in `indices` indicate positional indices
+          from the right (the default). This is similar to :func:`numpy.take`.
+
+        * True: negative values in `indices` indicate
+          missing values. These values are set to `fill_value`. Any other
+          other negative values raise a ``ValueError``.
+
+    fill_value : any, optional
+        Fill value to use for NA-indices when `allow_fill` is True.
+        This may be ``None``, in which case the default NA value for
+        the type (``self.dtype.na_value``) is used.
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+        Same type as the input.
+
+    Raises
+    ------
+    IndexError
+        When `indices` is out of bounds for the array.
+    ValueError
+        When the indexer contains negative values other than ``-1``
+        and `allow_fill` is True.
+
+    Notes
+    -----
+    When `allow_fill` is False, `indices` may be whatever dimensionality
+    is accepted by NumPy for `arr`.
+
+    When `allow_fill` is True, `indices` should be 1-D.
+
+    See Also
+    --------
+    numpy.take
+
+    Examples
+    --------
+    >>> from pandas.api.extensions import take
+
+    With the default ``allow_fill=False``, negative numbers indicate
+    positional indices from the right.
+
+    >>> take(np.array([10, 20, 30]), [0, 0, -1])
+    array([10, 10, 30])
+
+    Setting ``allow_fill=True`` will place `fill_value` in those positions.
+
+    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
+    array([10., 10., nan])
+
+    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
+    ...      fill_value=-10)
+    array([ 10,  10, -10])
+    """
+    from pandas.core.indexing import validate_indices
+
+    if not is_array_like(arr):
+        arr = np.asarray(arr)
+
+    indices = np.asarray(indices, dtype=np.intp)
+
+    if allow_fill:
+        # Pandas style, -1 means NA
+        validate_indices(indices, len(arr))
+        result = take_1d(arr, indices, allow_fill=True, fill_value=fill_value)
+    else:
+        # NumPy style
+        result = arr.take(indices)
+    return result
+
+
 def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
             allow_fill=True):
     """
@@ -1462,7 +1550,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
         Input array.
     indexer : ndarray
         1-D array of indices to take, subarrays corresponding to -1 value
-        indicies are filed with fill_value
+        indices are filed with fill_value
     axis : int, default 0
         Axis to take from
     out : ndarray or None, default None

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -462,22 +462,36 @@ def factorize(self, na_sentinel=-1):
     # ------------------------------------------------------------------------
     # Indexing methods
     # ------------------------------------------------------------------------
-    def take(self, indexer, allow_fill=True, fill_value=None):
+
+    def take(self, indices, allow_fill=False, fill_value=None):
         # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
         """Take elements from an array.
 
         Parameters
         ----------
-        indexer : sequence of integers
-            indices to be taken. -1 is used to indicate values
-            that are missing.
-        allow_fill : bool, default True
-            If False, indexer is assumed to contain no -1 values so no filling
-            will be done. This short-circuits computation of a mask. Result is
-            undefined if allow_fill == False and -1 is present in indexer.
-        fill_value : any, default None
-            Fill value to replace -1 values with. If applicable, this should
-            use the sentinel missing value for this type.
+        indices : sequence of integers
+            Indices to be taken.
+        allow_fill : bool, default False
+            How to handle negative values in `indices`.
+
+            * False: negative values in `indices` indicate positional indices
+              from the right (the default). This is similar to
+              :func:`numpy.take`.
+
+            * True: negative values in `indices` indicate
+              missing values. These values are set to `fill_value`. Any other
+              other negative values raise a ``ValueError``.
+
+        fill_value : any, optional
+            Fill value to use for NA-indices when `allow_fill` is True.
+            This may be ``None``, in which case the default NA value for
+            the type, ``self.dtype.na_value``, is used.
+
+            For many ExtensionArrays, there will be two representations of
+            `fill_value`: a user-facing "boxed" scalar, and a low-level
+            physical NA value. `fill_value` should be the user-facing version,
+            and the implementation should handle translating that to the
+            physical version for processing the take if nescessary.
 
         Returns
         -------
@@ -486,44 +500,56 @@ def take(self, indexer, allow_fill=True, fill_value=None):
         Raises
         ------
         IndexError
-            When the indexer is out of bounds for the array.
+            When the indices are out of bounds for the array.
+        ValueError
+            When `indices` contains negative values other than ``-1``
+            and `allow_fill` is True.
 
         Notes
         -----
-        This should follow pandas' semantics where -1 indicates missing values.
-        Positions where indexer is ``-1`` should be filled with the missing
-        value for this type.
-        This gives rise to the special case of a take on an empty
-        ExtensionArray that does not raises an IndexError straight away
-        when the `indexer` is all ``-1``.
+        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
+        ``iloc``, when `indices` is a sequence of values. Additionally,
+        it's called by :meth:`Series.reindex`, or any other method
+        that causes realignemnt, with a `fill_value`.
 
-        This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the
-        indexer is a sequence of values.
+        See Also
+        --------
+        numpy.take
+        pandas.api.extensions.take
 
         Examples
         --------
-        Suppose the extension array is backed by a NumPy array stored as
-        ``self.data``. Then ``take`` may be written as
+        Here's an example implementation, which relies on casting the
+        extension array to object dtype. This uses the helper method
+        :func:`pandas.api.extensions.take`.
 
         .. code-block:: python
 
-           def take(self, indexer, allow_fill=True, fill_value=None):
-               indexer = np.asarray(indexer)
-               mask = indexer == -1
+           def take(self, indices, allow_fill=False, fill_value=None):
+               from pandas.core.algorithms import take
 
-               # take on empty array not handled as desired by numpy
-               # in case of -1 (all missing take)
-               if not len(self) and mask.all():
-                   return type(self)([np.nan] * len(indexer))
+               # If the ExtensionArray is backed by an ndarray, then
+               # just pass that here instead of coercing to object.
+               data = self.astype(object)
 
-               result = self.data.take(indexer)
-               result[mask] = np.nan  # NA for this type
-               return type(self)(result)
+               if allow_fill and fill_value is None:
+                   fill_value = self.dtype.na_value
 
-        See Also
-        --------
-        numpy.take
+               # fill value should always be translated from the scalar
+               # type for the array, to the physical storage type for
+               # the data, before passing to take.
+
+               result = take(data, indices, fill_value=fill_value,
+                             allow_fill=allow_fill)
+               return self._from_sequence(result)
         """
+        # Implementer note: The `fill_value` parameter should be a user-facing
+        # value, an instance of self.dtype.type. When passed `fill_value=None`,
+        # the default of `self.dtype.na_value` should be used.
+        # This may differ from the physical storage type your ExtensionArray
+        # uses. In this case, your implementation is responsible for casting
+        # the user-facing type to the storage type, before using
+        # pandas.api.extensions.take
         raise AbstractMethodError(self)
 
     def copy(self, deep=False):

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -16,6 +16,12 @@ class _DtypeOpsMixin(object):
     # classes will inherit from this Mixin. Once everything is compatible, this
     # class's methods can be moved to ExtensionDtype and removed.
 
+    # na_value is the default NA value to use for this type. This is used in
+    # e.g. ExtensionArray.take. This should be the user-facing "boxed" version
+    # of the NA value, not the physical NA vaalue for storage.
+    # e.g. for JSONArray, this is an empty dictionary.
+    na_value = np.nan
+
     def __eq__(self, other):
         """Check whether 'other' is equal to self.
 
@@ -92,6 +98,8 @@ def is_dtype(cls, dtype):
 class ExtensionDtype(_DtypeOpsMixin):
     """A custom data type, to be paired with an ExtensionArray.
 
+    .. versionadded:: 0.23.0
+
     Notes
     -----
     The interface includes the following abstract methods that must
@@ -101,6 +109,9 @@ class ExtensionDtype(_DtypeOpsMixin):
     * name
     * construct_from_string
 
+    The `na_value` class attribute can be used to set the default NA value
+    for this type. :attr:`numpy.nan` is used by default.
+
     This class does not inherit from 'abc.ABCMeta' for performance reasons.
     Methods and properties required by the interface raise
     ``pandas.errors.AbstractMethodError`` and no ``register`` method is

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -255,7 +255,6 @@ def changeit():
 
 
 def maybe_promote(dtype, fill_value=np.nan):
-
     # if we passed an array here, determine the fill value by dtype
     if isinstance(fill_value, np.ndarray):
         if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
@@ -294,6 +293,8 @@ def maybe_promote(dtype, fill_value=np.nan):
     elif is_datetimetz(dtype):
         if isna(fill_value):
             fill_value = iNaT
+    elif is_extension_array_dtype(dtype) and isna(fill_value):
+        fill_value = dtype.na_value
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
             dtype = np.object_

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -502,6 +502,8 @@ def na_value_for_dtype(dtype, compat=True):
     """
     dtype = pandas_dtype(dtype)
 
+    if is_extension_array_dtype(dtype):
+        return dtype.na_value
     if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
             is_timedelta64_dtype(dtype) or is_period_dtype(dtype)):
         return NaT

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3476,7 +3476,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
                                            allow_dups=False)
 
     def _reindex_columns(self, new_columns, method, copy, level,
-                         fill_value=np.nan, limit=None, tolerance=None):
+                         fill_value=None, limit=None, tolerance=None):
         new_columns, indexer = self.columns.reindex(new_columns, method=method,
                                                     level=level, limit=limit,
                                                     tolerance=tolerance)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3660,7 +3660,7 @@ def reindex(self, *args, **kwargs):
         copy = kwargs.pop('copy', True)
         limit = kwargs.pop('limit', None)
         tolerance = kwargs.pop('tolerance', None)
-        fill_value = kwargs.pop('fill_value', np.nan)
+        fill_value = kwargs.pop('fill_value', None)
 
         # Series.reindex doesn't use / need the axis kwarg
         # We pop and ignore it here, to make writing Series/Frame generic code
@@ -3776,7 +3776,7 @@ def _reindex_multi(self, axes, copy, fill_value):
 
     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
     def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
-                     limit=None, fill_value=np.nan):
+                     limit=None, fill_value=None):
         msg = ("'.reindex_axis' is deprecated and will be removed in a future "
                "version. Use '.reindex' instead.")
         self._consolidate_inplace()
@@ -3790,7 +3790,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
         return self._reindex_with_indexers({axis: [new_index, indexer]},
                                            fill_value=fill_value, copy=copy)
 
-    def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False,
+    def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False,
                                allow_dups=False):
         """allow_dups indicates an internal call here """
 
@@ -7252,7 +7252,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True,
             raise TypeError('unsupported type: %s' % type(other))
 
     def _align_frame(self, other, join='outer', axis=None, level=None,
-                     copy=True, fill_value=np.nan, method=None, limit=None,
+                     copy=True, fill_value=None, method=None, limit=None,
                      fill_axis=0):
         # defaults
         join_index, join_columns = None, None

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -2417,12 +2417,53 @@ def maybe_convert_indices(indices, n):
     mask = indices < 0
     if mask.any():
         indices[mask] += n
+
     mask = (indices >= n) | (indices < 0)
     if mask.any():
         raise IndexError("indices are out-of-bounds")
     return indices
 
 
+def validate_indices(indices, n):
+    """Perform bounds-checking for an indexer.
+
+    -1 is allowed for indicating missing values.
+
+    Parameters
+    ----------
+    indices : ndarray
+    n : int
+        length of the array being indexed
+
+    Raises
+    ------
+    ValueError
+
+    Examples
+    --------
+    >>> validate_indices([1, 2], 3)
+    # OK
+    >>> validate_indices([1, -2], 3)
+    ValueError
+    >>> validate_indices([1, 2, 3], 3)
+    IndexError
+    >>> validate_indices([-1, -1], 0)
+    # OK
+    >>> validate_indices([0, 1], 0)
+    IndexError
+    """
+    if len(indices):
+        min_idx = indices.min()
+        if min_idx < -1:
+            msg = ("'indices' contains values less than allowed ({} < {})"
+                   .format(min_idx, -1))
+            raise ValueError(msg)
+
+        max_idx = indices.max()
+        if max_idx >= n:
+            raise IndexError("indices are out-of-bounds")
+
+
 def maybe_convert_ix(*args):
     """
     We likely want to take the cross-product