Skip to content

Commit

Permalink
ExtensionArray.take default implementation (#20814)
Browse files Browse the repository at this point in the history
Implements a take interface that's compatible with NumPy and optionally pandas'
NA semantics.

Closes #20640
  • Loading branch information
TomAugspurger authored and jorisvandenbossche committed Apr 27, 2018
1 parent 96f2f57 commit 2cbdd9a
Show file tree
Hide file tree
Showing 19 changed files with 460 additions and 81 deletions.
1 change: 1 addition & 0 deletions pandas/api/extensions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
from pandas.core.accessor import (register_dataframe_accessor, # noqa
register_index_accessor,
register_series_accessor)
from pandas.core.algorithms import take # noqa
from pandas.core.arrays.base import ExtensionArray # noqa
from pandas.core.dtypes.dtypes import ExtensionDtype # noqa
90 changes: 89 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1448,6 +1448,94 @@ def func(arr, indexer, out, fill_value=np.nan):
return func


def take(arr, indices, allow_fill=False, fill_value=None):
    """
    Take elements from an array.

    .. versionadded:: 0.23.0

    Parameters
    ----------
    arr : sequence
        Non array-likes (sequences without a dtype) are coerced
        to an ndarray.
    indices : sequence of integers
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: ``-1`` in `indices` indicates a missing value. These
          positions are set to `fill_value`. Any other negative values
          raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type of `arr` is used.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take

    Examples
    --------
    >>> from pandas.api.extensions import take

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...      fill_value=-10)
    array([ 10,  10, -10])
    """
    from pandas.core.indexing import validate_indices

    # Anything without a dtype (e.g. a plain list) is turned into an ndarray.
    data = arr if is_array_like(arr) else np.asarray(arr)
    positions = np.asarray(indices, dtype=np.intp)

    if not allow_fill:
        # NumPy semantics: negative positions count from the right.
        return data.take(positions)

    # pandas semantics: -1 means NA; bounds-check before taking.
    validate_indices(positions, len(data))
    return take_1d(data, positions, allow_fill=True, fill_value=fill_value)


def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
allow_fill=True):
"""
Expand All @@ -1462,7 +1550,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
Input array.
indexer : ndarray
1-D array of indices to take, subarrays corresponding to -1 value
indicies are filed with fill_value
indices are filled with fill_value
axis : int, default 0
Axis to take from
out : ndarray or None, default None
Expand Down
96 changes: 61 additions & 35 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,22 +462,36 @@ def factorize(self, na_sentinel=-1):
# ------------------------------------------------------------------------
# Indexing methods
# ------------------------------------------------------------------------
def take(self, indexer, allow_fill=True, fill_value=None):

def take(self, indices, allow_fill=False, fill_value=None):
# type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
"""Take elements from an array.
Parameters
----------
indexer : sequence of integers
indices to be taken. -1 is used to indicate values
that are missing.
allow_fill : bool, default True
If False, indexer is assumed to contain no -1 values so no filling
will be done. This short-circuits computation of a mask. Result is
undefined if allow_fill == False and -1 is present in indexer.
fill_value : any, default None
Fill value to replace -1 values with. If applicable, this should
use the sentinel missing value for this type.
indices : sequence of integers
Indices to be taken.
allow_fill : bool, default False
How to handle negative values in `indices`.
* False: negative values in `indices` indicate positional indices
from the right (the default). This is similar to
:func:`numpy.take`.
* True: negative values in `indices` indicate
missing values. These values are set to `fill_value`. Any
other negative values raise a ``ValueError``.
fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
This may be ``None``, in which case the default NA value for
the type, ``self.dtype.na_value``, is used.
For many ExtensionArrays, there will be two representations of
`fill_value`: a user-facing "boxed" scalar, and a low-level
physical NA value. `fill_value` should be the user-facing version,
and the implementation should handle translating that to the
physical version for processing the take if necessary.
Returns
-------
Expand All @@ -486,44 +500,56 @@ def take(self, indexer, allow_fill=True, fill_value=None):
Raises
------
IndexError
When the indexer is out of bounds for the array.
When the indices are out of bounds for the array.
ValueError
When `indices` contains negative values other than ``-1``
and `allow_fill` is True.
Notes
-----
This should follow pandas' semantics where -1 indicates missing values.
Positions where indexer is ``-1`` should be filled with the missing
value for this type.
This gives rise to the special case of a take on an empty
ExtensionArray that does not raise an IndexError straight away
when the `indexer` is all ``-1``.
ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
``iloc``, when `indices` is a sequence of values. Additionally,
it's called by :meth:`Series.reindex`, or any other method
that causes realignment, with a `fill_value`.
This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the
indexer is a sequence of values.
See Also
--------
numpy.take
pandas.api.extensions.take
Examples
--------
Suppose the extension array is backed by a NumPy array stored as
``self.data``. Then ``take`` may be written as
Here's an example implementation, which relies on casting the
extension array to object dtype. This uses the helper method
:func:`pandas.api.extensions.take`.
.. code-block:: python
def take(self, indexer, allow_fill=True, fill_value=None):
indexer = np.asarray(indexer)
mask = indexer == -1
def take(self, indices, allow_fill=False, fill_value=None):
from pandas.core.algorithms import take
# take on empty array not handled as desired by numpy
# in case of -1 (all missing take)
if not len(self) and mask.all():
return type(self)([np.nan] * len(indexer))
# If the ExtensionArray is backed by an ndarray, then
# just pass that here instead of coercing to object.
data = self.astype(object)
result = self.data.take(indexer)
result[mask] = np.nan # NA for this type
return type(self)(result)
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
See Also
--------
numpy.take
# fill value should always be translated from the scalar
# type for the array, to the physical storage type for
# the data, before passing to take.
result = take(data, indices, fill_value=fill_value,
allow_fill=allow_fill)
return self._from_sequence(result)
"""
# Implementer note: The `fill_value` parameter should be a user-facing
# value, an instance of self.dtype.type. When passed `fill_value=None`,
# the default of `self.dtype.na_value` should be used.
# This may differ from the physical storage type your ExtensionArray
# uses. In this case, your implementation is responsible for casting
# the user-facing type to the storage type, before using
# pandas.api.extensions.take
raise AbstractMethodError(self)

def copy(self, deep=False):
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/dtypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ class _DtypeOpsMixin(object):
# classes will inherit from this Mixin. Once everything is compatible, this
# class's methods can be moved to ExtensionDtype and removed.

# na_value is the default NA value to use for this type. This is used in
# e.g. ExtensionArray.take. This should be the user-facing "boxed" version
# of the NA value, not the physical NA value for storage.
# e.g. for JSONArray, this is an empty dictionary.
na_value = np.nan

def __eq__(self, other):
"""Check whether 'other' is equal to self.
Expand Down Expand Up @@ -92,6 +98,8 @@ def is_dtype(cls, dtype):
class ExtensionDtype(_DtypeOpsMixin):
"""A custom data type, to be paired with an ExtensionArray.
.. versionadded:: 0.23.0
Notes
-----
The interface includes the following abstract methods that must
Expand All @@ -101,6 +109,9 @@ class ExtensionDtype(_DtypeOpsMixin):
* name
* construct_from_string
The `na_value` class attribute can be used to set the default NA value
for this type. :attr:`numpy.nan` is used by default.
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,6 @@ def changeit():


def maybe_promote(dtype, fill_value=np.nan):

# if we passed an array here, determine the fill value by dtype
if isinstance(fill_value, np.ndarray):
if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
Expand Down Expand Up @@ -294,6 +293,8 @@ def maybe_promote(dtype, fill_value=np.nan):
elif is_datetimetz(dtype):
if isna(fill_value):
fill_value = iNaT
elif is_extension_array_dtype(dtype) and isna(fill_value):
fill_value = dtype.na_value
elif is_float(fill_value):
if issubclass(dtype.type, np.bool_):
dtype = np.object_
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,8 @@ def na_value_for_dtype(dtype, compat=True):
"""
dtype = pandas_dtype(dtype)

if is_extension_array_dtype(dtype):
return dtype.na_value
if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
is_timedelta64_dtype(dtype) or is_period_dtype(dtype)):
return NaT
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3476,7 +3476,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
allow_dups=False)

def _reindex_columns(self, new_columns, method, copy, level,
fill_value=np.nan, limit=None, tolerance=None):
fill_value=None, limit=None, tolerance=None):
new_columns, indexer = self.columns.reindex(new_columns, method=method,
level=level, limit=limit,
tolerance=tolerance)
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3660,7 +3660,7 @@ def reindex(self, *args, **kwargs):
copy = kwargs.pop('copy', True)
limit = kwargs.pop('limit', None)
tolerance = kwargs.pop('tolerance', None)
fill_value = kwargs.pop('fill_value', np.nan)
fill_value = kwargs.pop('fill_value', None)

# Series.reindex doesn't use / need the axis kwarg
# We pop and ignore it here, to make writing Series/Frame generic code
Expand Down Expand Up @@ -3776,7 +3776,7 @@ def _reindex_multi(self, axes, copy, fill_value):

@Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
limit=None, fill_value=np.nan):
limit=None, fill_value=None):
msg = ("'.reindex_axis' is deprecated and will be removed in a future "
"version. Use '.reindex' instead.")
self._consolidate_inplace()
Expand All @@ -3790,7 +3790,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
return self._reindex_with_indexers({axis: [new_index, indexer]},
fill_value=fill_value, copy=copy)

def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False,
def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False,
allow_dups=False):
"""allow_dups indicates an internal call here """

Expand Down Expand Up @@ -7252,7 +7252,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True,
raise TypeError('unsupported type: %s' % type(other))

def _align_frame(self, other, join='outer', axis=None, level=None,
copy=True, fill_value=np.nan, method=None, limit=None,
copy=True, fill_value=None, method=None, limit=None,
fill_axis=0):
# defaults
join_index, join_columns = None, None
Expand Down
41 changes: 41 additions & 0 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2417,12 +2417,53 @@ def maybe_convert_indices(indices, n):
mask = indices < 0
if mask.any():
indices[mask] += n

mask = (indices >= n) | (indices < 0)
if mask.any():
raise IndexError("indices are out-of-bounds")
return indices


def validate_indices(indices, n):
    """Perform bounds-checking for an indexer.

    -1 is allowed for indicating missing values.

    Parameters
    ----------
    indices : ndarray or sequence of int
        Positions to validate. Sequences without ``min``/``max`` methods
        (e.g. lists) are converted to an ndarray first.
    n : int
        length of the array being indexed

    Raises
    ------
    ValueError
        When the smallest index is less than ``-1``.
    IndexError
        When the largest index is greater than or equal to `n`.

    Examples
    --------
    >>> validate_indices([1, 2], 3)  # OK
    >>> validate_indices([1, -2], 3)
    Traceback (most recent call last):
        ...
    ValueError: 'indices' contains values less than allowed (-2 < -1)
    >>> validate_indices([1, 2, 3], 3)
    Traceback (most recent call last):
        ...
    IndexError: indices are out-of-bounds
    >>> validate_indices([-1, -1], 0)  # OK
    >>> validate_indices([0, 1], 0)
    Traceback (most recent call last):
        ...
    IndexError: indices are out-of-bounds
    """
    # Imported locally so this block does not depend on a module-level
    # alias that is not visible from here.
    import numpy as np

    # The documented examples pass plain lists, which have no .min()/.max();
    # asarray is a no-op for inputs that are already ndarrays.
    indices = np.asarray(indices)

    # An empty indexer is always valid (and .min()/.max() would fail on it).
    if len(indices):
        min_idx = indices.min()
        if min_idx < -1:
            msg = ("'indices' contains values less than allowed ({} < {})"
                   .format(min_idx, -1))
            raise ValueError(msg)

        max_idx = indices.max()
        if max_idx >= n:
            raise IndexError("indices are out-of-bounds")


def maybe_convert_ix(*args):
"""
We likely want to take the cross-product
Expand Down
Loading

0 comments on commit 2cbdd9a

Please sign in to comment.