From 9b8564f24e837e099edeea4e0685e81644117384 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Mar 2018 09:16:54 -0500 Subject: [PATCH 01/22] REF: IntervalIndex[IntervalArray] Closes #19209 --- doc/source/basics.rst | 22 +- doc/source/whatsnew/v0.23.0.txt | 71 ++ pandas/core/arrays/__init__.py | 12 +- pandas/core/arrays/categorical.py | 3 + pandas/core/arrays/interval.py | 838 ++++++++++++++++++ pandas/core/dtypes/dtypes.py | 8 +- pandas/core/indexes/base.py | 3 +- pandas/core/indexes/interval.py | 666 +++----------- pandas/core/util/hashing.py | 4 +- pandas/io/packers.py | 17 +- pandas/tests/dtypes/test_dtypes.py | 6 +- pandas/tests/extension/test_common.py | 2 +- pandas/tests/extension/test_interval.py | 122 +++ pandas/tests/indexes/common.py | 4 +- pandas/tests/indexes/interval/test_astype.py | 2 +- .../indexes/interval/test_construction.py | 4 +- .../tests/indexes/interval/test_interval.py | 22 +- pandas/tests/test_base.py | 4 +- pandas/tests/util/test_testing.py | 16 + pandas/util/exceptions.py | 16 + pandas/util/testing.py | 35 +- 21 files changed, 1295 insertions(+), 582 deletions(-) create mode 100644 pandas/core/arrays/interval.py create mode 100644 pandas/tests/extension/test_interval.py create mode 100644 pandas/util/exceptions.py diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 8d09f1fc04c1f..91165a86c0ae4 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1925,11 +1925,23 @@ untouched. If the data is modified, it is because you did so explicitly. dtypes ------ -The main types stored in pandas objects are ``float``, ``int``, ``bool``, -``datetime64[ns]`` and ``datetime64[ns, tz]``, ``timedelta[ns]``, -``category`` and ``object``. In addition these dtypes have item sizes, e.g. -``int64`` and ``int32``. See :ref:`Series with TZ ` -for more detail on ``datetime64[ns, tz]`` dtypes. +For the most part, pandas uses NumPy arrays and dtypes for Series or individual +columns of a DataFrame. 
The main types allowed in pandas objects are ``float``, +``int``, ``bool``, and ``datetime64[ns]`` (note that NumPy does not support +timezone-aware datetimes). + +In addition to NumPy's types, pandas :ref:`extends <extending.extension-types>` +NumPy's type-system for a few cases. + +* :ref:`Categorical <categorical>` +* :ref:`Datetime with Timezone <timeseries.timezone_series>` +* Interval + +Pandas uses the ``object`` dtype for storing strings. + +Finally, arbitrary objects may be stored using the ``object`` dtype, but should +be avoided to the extent possible (for performance and interoperability with +other libraries and methods). See :ref:`basics.object_conversion`. A convenient :attr:`~DataFrame.dtypes` attribute for DataFrame returns a Series with the data type of each column. diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 09bd09b06d9b9..3f04743c75347 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -299,6 +299,41 @@ Supplying a ``CategoricalDtype`` will make the categories in each column consist df['A'].dtype df['B'].dtype +.. _whatsnew_023.enhancements.interval: + +Storing Interval Data in Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interval data may now be stored in a Series or DataFrame, in addition to an +:class:`IntervalIndex` like before. + +.. ipython:: python + + ser = pd.Series(pd.interval_range(0, 5)) + ser + ser.dtype + +Previously, these would be cast to a NumPy array of Interval objects. In general, +this should result in better performance when storing an array of intervals in +a Series. + +Note that the ``.values`` of a Series containing intervals is no longer a NumPy +array. Rather, it's an ``ExtensionArray``, composed of two arrays ``left`` and +``right``. + +.. ipython:: python + + ser.values + +To recover the NumPy array of Interval objects, use :func:`numpy.asarray`: + +.. ipython:: python + + np.asarray(ser.values) + +This is the same behavior as ``Series.values`` for categorical data. 
See +:ref:`whatsnew_0230.api_breaking.interval_values` for more. + .. _whatsnew_023.enhancements.extension: Extending Pandas with Custom Types @@ -479,6 +514,42 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use 'Taxes': -200, 'Net result': 300}).sort_index() +.. _whatsnew_0230.api_breaking.interval_values: + +``IntervalIndex.values`` is now an ``IntervalArray`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``.values`` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: idx = pd.interval_range(0, 4) + + In [2]: idx.values + Out[2]: + array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), + Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], + dtype=object) + +New Behavior: + +.. ipython:: python + + idx = pd.interval_range(0, 4) + idx.values + +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. + +For situations where you need an ``ndarray`` of Interval objects, use +:func:`numpy.asarray` or ``idx.astype(object)``. + +.. ipython:: python + + idx.values.astype(object) + .. 
_whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index f8adcf520c15b..16f2cb9bd8669 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,2 +1,10 @@ -from .base import ExtensionArray # noqa -from .categorical import Categorical # noqa +from .base import ExtensionArray +from .categorical import Categorical +from .interval import IntervalArray + + +__all__ = [ + 'Categorical', + 'ExtensionArray', + 'IntervalArray', +] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b5a4785fd98a6..086bf231cb4cb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -19,6 +19,7 @@ _ensure_int64, _ensure_object, _ensure_platform_int, + is_extension_array_dtype, is_dtype_equal, is_datetimelike, is_datetime64_dtype, @@ -1218,6 +1219,8 @@ def __array__(self, dtype=None): ret = take_1d(self.categories.values, self._codes) if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) + if is_extension_array_dtype(ret): + ret = np.asarray(ret) return ret def __setstate__(self, state): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py new file mode 100644 index 0000000000000..b793eb5a59aae --- /dev/null +++ b/pandas/core/arrays/interval.py @@ -0,0 +1,838 @@ +import textwrap +import numpy as np + +from pandas._libs.interval import (Interval, IntervalMixin, + intervals_to_interval_bounds) +from pandas.compat.numpy import function as nv +from pandas.core.common import _all_not_none, _asarray_tuplesafe +from pandas.core.config import get_option +from pandas.core.dtypes.cast import maybe_convert_platform +from pandas.core.dtypes.common import (_ensure_platform_int, + is_categorical_dtype, is_float_dtype, + is_integer_dtype, is_interval_dtype, + is_scalar, is_string_dtype, + pandas_dtype) +from pandas.core.dtypes.dtypes import IntervalDtype +from 
pandas.core.dtypes.generic import (ABCDatetimeIndex, ABCPeriodIndex, + ABCSeries) +from pandas.core.dtypes.missing import isna, notna +from pandas.core.indexes.base import Index, _ensure_index +from pandas.util._decorators import Appender + +from . import ExtensionArray, Categorical + +_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) +_interval_shared_docs = {} +_shared_docs_kwargs = dict( + klass='IntervalArray', + name='' +) + + +_interval_shared_docs['class'] = """%(summary)s + +.. versionadded:: %(versionadded)s + +.. warning:: + + The indexing behaviors are provisional and may change in + a future version of pandas. + +Parameters +---------- +data : array-like (1-dimensional) + Array-like containing Interval objects from which to build the + %(klass)s. +closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both or + neither. +%(name)s\ +copy : boolean, default False + Copy the meta-data. +dtype : dtype or None, default None + If None, dtype will be inferred + + .. versoinadded:: 0.23.0 + +Attributes +---------- +left +right +closed +mid +length +values +is_non_overlapping_monotonic + +Methods +------- +from_arrays +from_tuples +from_breaks +%(extra_methods)s\ + +%(examples)s\ + +Notes +------ +See the `user guide +`_ +for more. 
+ +See Also +-------- +Index : The base pandas Index type +Interval : A bounded slice-like interval; the elements of an IntervalIndex +interval_range : Function to create a fixed frequency IntervalIndex +cut, qcut : Convert arrays of continuous data into Categoricals/Series of + Intervals +""" + + +@Appender(_interval_shared_docs['class'] % dict( + klass="IntervalArray", + summary="Pandas array for interval data that are closed on the same side", + versionadded="0.23.0", + name='', extra_methods='', examples='', +)) +class IntervalArray(IntervalMixin, ExtensionArray): + dtype = IntervalDtype() + ndim = 1 + can_hold_na = True + _na_value = _fill_value = np.nan + + def __new__(cls, data, closed=None, dtype=None, copy=False, + fastpath=False, verify_integrity=True): + + from pandas.core.indexes.interval import IntervalIndex + + if fastpath: + return cls._simple_new(data.left, data.right, closed, + copy=copy, dtype=dtype, + verify_integrity=False) + + if isinstance(data, ABCSeries) and is_interval_dtype(data): + data = data.values + if isinstance(data, (cls, IntervalIndex)): + left = data.left + right = data.right + closed = data.closed + else: + + # don't allow scalars + if is_scalar(data): + msg = ("{}(...) 
must be called with a collection of some kind," + " {} was passed") + raise TypeError(msg.format(cls.__name__, data)) + + data = maybe_convert_platform_interval(data) + left, right, infer_closed = intervals_to_interval_bounds(data) + + if _all_not_none(closed, infer_closed) and closed != infer_closed: + # GH 18421 + msg = ("conflicting values for closed: constructor got " + "'{closed}', inferred from data '{infer_closed}'" + .format(closed=closed, infer_closed=infer_closed)) + raise ValueError(msg) + + closed = closed or infer_closed + + return cls._simple_new(left, right, closed, copy=copy, dtype=dtype, + verify_integrity=verify_integrity) + + @classmethod + def _simple_new(cls, left, right, closed=None, + copy=False, dtype=None, verify_integrity=True): + result = IntervalMixin.__new__(cls) + + closed = closed or 'right' + left = _ensure_index(left, copy=copy) + right = _ensure_index(right, copy=copy) + + if dtype is not None: + # GH 19262: dtype must be an IntervalDtype to override inferred + dtype = pandas_dtype(dtype) + if not is_interval_dtype(dtype): + msg = 'dtype must be an IntervalDtype, got {dtype}' + raise TypeError(msg.format(dtype=dtype)) + elif dtype.subtype is not None: + left = left.astype(dtype.subtype) + right = right.astype(dtype.subtype) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + elif is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ('must not have differing left [{ltype}] and right ' + '[{rtype}] types') + raise ValueError(msg.format(ltype=type(left).__name__, + rtype=type(right).__name__)) + elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): + # GH 19016 + msg = ('category, object, and string subtypes are not supported ' + 'for IntervalIndex') + raise TypeError(msg) + elif isinstance(left, ABCPeriodIndex): + msg = 'Period dtypes are not supported, use a 
PeriodIndex instead' + raise ValueError(msg) + elif (isinstance(left, ABCDatetimeIndex) and + str(left.tz) != str(right.tz)): + msg = ("left and right must have the same time zone, got " + "'{left_tz}' and '{right_tz}'") + raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) + + result._left = left + result._right = right + result._closed = closed + if verify_integrity: + result._validate() + return result + + @classmethod + def _constructor_from_sequence(cls, scalars): + return cls(scalars) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values, closed=original.closed) + + _interval_shared_docs['from_breaks'] = """ + Construct an %(klass)s from an array of splits. + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : boolean, default False + copy the data + dtype : dtype or None, default None + If None, dtype will be inferred + + .. versionadded:: 0.23.0 + + Examples + -------- + >>> pd.%(klass)s.from_breaks([0, 1, 2, 3]) + %(klass)s([(0, 1], (1, 2], (2, 3]] + closed='right', + dtype='interval[int64]') + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + %(klass)s.from_arrays : Construct from a left and right array + %(klass)s.from_tuples : Construct from a sequence of tuples + """ + + @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _shared_docs_kwargs) + def from_breaks(cls, breaks, closed='right', copy=False, dtype=None): + breaks = maybe_convert_platform_interval(breaks) + + return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, + dtype=dtype) + + _interval_shared_docs['from_arrays'] = """ + Construct from two arrays defining the left and right bounds. 
+ + Parameters + ---------- + left : array-like (1-dimensional) + Left bounds for each interval. + right : array-like (1-dimensional) + Right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : boolean, default False + Copy the data. + dtype : dtype, optional + If None, dtype will be inferred. + + .. versionadded:: 0.23.0 + + Returns + ------- + %(klass)s + + Notes + ----- + Each element of `left` must be less than or equal to the `right` + element at the same position. If an element is missing, it must be + missing in both `left` and `right`. A TypeError is raised when + using an unsupported type for `left` or `right`. At the moment, + 'category', 'object', and 'string' subtypes are not supported. + + Raises + ------ + ValueError + When a value is missing in only one of `left` or `right`. + When a value in `left` is greater than the corresponding value + in `right`. + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_breaks : Construct an IntervalIndex from an array of + splits. + %(klass)s.from_tuples : Construct an IntervalIndex from a + list/array of tuples. + + + Examples + -------- + >>> %(klass)s.from_arrays([0, 1, 2], [1, 2, 3]) + %(klass)s([(0, 1], (1, 2], (2, 3]] + closed='right', + dtype='interval[int64]') + """ + + @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _shared_docs_kwargs) + def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): + left = maybe_convert_platform_interval(left) + right = maybe_convert_platform_interval(right) + + return cls._simple_new(left, right, closed, copy=copy, + dtype=dtype, verify_integrity=True) + + _interval_shared_docs['from_intervals'] = """ + Construct an %(klass)s from a 1d array of Interval objects + + .. 
deprecated:: 0.23.0 + + Parameters + ---------- + data : array-like (1-dimensional) + Array of Interval objects. All intervals must be closed on the same + sides. + copy : boolean, default False + by-default copy the data, this is compat only and ignored + dtype : dtype or None, default None + If None, dtype will be inferred + + ..versionadded:: 0.23.0 + + Examples + -------- + >>> pd.%(klass)s.from_intervals([pd.Interval(0, 1), + ... pd.Interval(1, 2)]) + %(klass)s([(0, 1], (1, 2]] + closed='right', dtype='interval[int64]') + + The generic Index constructor work identically when it infers an array + of all intervals: + + >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)]) + %(klass)s([(0, 1], (1, 2]] + closed='right', dtype='interval[int64]') + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits + %(klass)s.from_tuples : Construct an %(klass)s from a + list/array of tuples + """ + + _interval_shared_docs['from_tuples'] = """ + Construct an %(klass)s from a list/array of tuples + + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. 
+ copy : boolean, default False + by-default copy the data, this is compat only and ignored + dtype : dtype or None, default None + If None, dtype will be inferred + + ..versionadded:: 0.23.0 + + + Examples + -------- + >>> pd.%(klass)s.from_tuples([(0, 1), (1, 2)]) + %(klass)s([(0, 1], (1, 2]], + closed='right', dtype='interval[int64]') + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits + """ + + @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _shared_docs_kwargs) + def from_tuples(cls, data, closed='right', copy=False, dtype=None): + if len(data): + left, right = [], [] + else: + left = right = data + + for d in data: + if isna(d): + lhs = rhs = np.nan + else: + name = cls.__name__ + try: + # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] + lhs, rhs = d + except ValueError: + msg = ('{name}.from_tuples requires tuples of ' + 'length 2, got {tpl}').format(name=name, tpl=d) + raise ValueError(msg) + except TypeError: + msg = ('{name}.from_tuples received an invalid ' + 'item, {tpl}').format(name=name, tpl=d) + raise TypeError(msg) + lhs, rhs = d + left.append(lhs) + right.append(rhs) + + return cls.from_arrays(left, right, closed, copy=False, + dtype=dtype) + + def _validate(self): + """Verify that the IntervalIndex is valid. 
+ + Checks that + + * closed is valid + * left and right match lengths + * left and right have the same missing values + * left is always below right + """ + if self.closed not in _VALID_CLOSED: + raise ValueError("invalid option for 'closed': {closed}" + .format(closed=self.closed)) + if len(self.left) != len(self.right): + raise ValueError('left and right must have the same length') + left_mask = notna(self.left) + right_mask = notna(self.right) + if not (left_mask == right_mask).all(): + raise ValueError('missing values must be missing in the same ' + 'location both left and right sides') + if not (self.left[left_mask] <= self.right[left_mask]).all(): + raise ValueError('left side of interval must be <= right side') + + # --------- + # Interface + # --------- + def __iter__(self): + return iter(self.values) + + def __len__(self): + return len(self.left) + + def __getitem__(self, value): + mask = self.isna()[value] + if is_scalar(mask) and mask: + return self._fill_value + + left = self.left[value] + right = self.right[value] + + # scalar + if not isinstance(left, Index): + return Interval(left, right, self.closed) + + return self._shallow_copy(left, right) + + def fillna(self, value=None, method=None, limit=None): + if method is not None: + raise TypeError('Filling by method is not supported for ' + 'IntervalArray.') + if limit is not None: + raise TypeError('limit is not supported for IntervalArray.') + + if not isinstance(value, Interval): + msg = ("Interval.fillna only supports filling with scalar " + "'value'. Got a '{}' instead of a 'pandas.Interval'." 
+ .format(type(value).__name__)) + raise TypeError(msg) + + value = getattr(value, '_values', value) + + left = self.left.fillna(value=value.left) + right = self.right.fillna(value=value.right) + return self._shallow_copy(left, right) + + @property + def dtype(self): + return IntervalDtype.construct_from_string(str(self.left.dtype)) + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if is_interval_dtype(dtype) and dtype != self.dtype: + try: + new_left = self.left.astype(dtype.subtype) + new_right = self.right.astype(dtype.subtype) + except TypeError: + msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' + 'incompatible') + raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) + return self._shallow_copy(new_left, new_right) + elif is_interval_dtype(dtype): + if copy: + return self.copy() + else: + return self + elif is_categorical_dtype(dtype): + return Categorical(self.values) + # TODO: This try/except will be repeated. + try: + return self.values.astype(dtype, copy=copy) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + + @classmethod + def _concat_same_type(cls, to_concat): + closed = set(interval.closed for interval in to_concat) + if len(closed) != 1: + raise ValueError("Intervals must all be closed on the same side.") + closed = closed.pop() + + # TODO: avoid intermediate list + left = np.concatenate([interval.left for interval in to_concat]) + right = np.concatenate([interval.right for interval in to_concat]) + return cls._simple_new(left, right, closed=closed, copy=False) + + def _shallow_copy(self, left=None, right=None): + from pandas.core.indexes.interval import IntervalIndex + + if left is None: + + # no values passed + # XXX: is ^ right? Or does that mean just left wasn't passed? 
+ left, right = self.left, self.right + + elif right is None: + + # only single value passed, could be an IntervalIndex + # or array of Intervals + if not isinstance(left, (type(self), IntervalIndex)): + left = type(self)(left) + + left, right = left.left, left.right + else: + + # both left and right are values + pass + + return self._simple_new(left, right, closed=self.closed, + verify_integrity=False) + + # TODO: doc + def copy(self, deep=False): + left = self.left.copy(deep=True) if deep else self.left + right = self.right.copy(deep=True) if deep else self.right + closed = self.closed + # TODO: Could skip verify_integrity here. + return type(self).from_arrays(left, right, closed=closed) + + def _formatting_values(self): + return self.values + + def get_values(self): + return self.values + + def isna(self): + return isna(self.left) + + @property + def nbytes(self): + return self.left.nbytes + self.right.nbytes + + @property + def itemsize(self): + return self.left.itemsize + self.right.itemsize + + def take(self, indices, axis=0, allow_fill=True, fill_value=None, + **kwargs): + nv.validate_take(tuple(), kwargs) + indices = _ensure_platform_int(indices) + left, right = self.left, self.right + + if fill_value is None: + fill_value = self._na_value + mask = indices == -1 + + if not mask.any(): + # we won't change dtype here in this case + # if we don't need + allow_fill = False + + taker = lambda x: x.take(indices, allow_fill=allow_fill, + fill_value=fill_value) + + try: + new_left = taker(left) + new_right = taker(right) + except ValueError: + + # we need to coerce; migth have NA's in an + # integer dtype + new_left = taker(left.astype(float)) + new_right = taker(right.astype(float)) + + return self._shallow_copy(new_left, new_right) + + take_nd = take + + # Formatting + + def _format_data(self): + + # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical + n = len(self) + max_seq_items = 
min((get_option( + 'display.max_seq_items') or n) // 10, 10) + + formatter = str + + if n == 0: + summary = '[]' + elif n == 1: + first = formatter(self[0]) + summary = '[{first}]'.format(first=first) + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary = '[{first}, {last}]'.format(first=first, last=last) + else: + + if n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in self[:n]] + tail = [formatter(x) for x in self[-n:]] + summary = '[{head} ... {tail}]'.format( + head=', '.join(head), tail=', '.join(tail)) + else: + head = [] + tail = [formatter(x) for x in self] + summary = '[{tail}]'.format(tail=', '.join(tail)) + + return summary + + def __repr__(self): + tpl = textwrap.dedent("""\ + {cls}({data}, + {lead}closed='{closed}', + {lead}dtype='{dtype}')""") + return tpl.format(cls=self.__class__.__name__, + data=self._format_data(), + lead=' ' * len(self.__class__.__name__) + ' ', + closed=self.closed, dtype=self.dtype) + + def _format_space(self): + space = ' ' * (len(self.__class__.__name__) + 1) + return "\n{space}".format(space=space) + + @property + def left(self): + """ + Return the left endpoints of each Interval in the IntervalIndex as + an Index + """ + return self._left + + @property + def right(self): + """ + Return the right endpoints of each Interval in the IntervalIndex as + an Index + """ + return self._right + + @property + def closed(self): + """ + Whether the intervals are closed on the left-side, right-side, both or + neither + """ + return self._closed + + @property + def length(self): + """ + Return an Index with entries denoting the length of each Interval in + the IntervalIndex + """ + try: + return self.right - self.left + except TypeError: + # length not defined for some types, e.g. string + msg = ('IntervalIndex contains Intervals without defined length, ' + 'e.g. 
Intervals with string endpoints') + raise TypeError(msg) + + @property + def mid(self): + """ + Return the midpoint of each Interval in the IntervalIndex as an Index + """ + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * self.length + + @property + def size(self): + # Avoid materializing self.values + return self.left.size + + @property + def shape(self): + return self.left.shape + + @property + def itemsize(self): + return self.left.itemsize + self.right.itemsize + + @property + def is_non_overlapping_monotonic(self): + """ + Return True if the IntervalIndex is non-overlapping (no Intervals share + points) and is either monotonic increasing or monotonic decreasing, + else False + """ + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) + # we already require left <= right + + # strict inequality for closed == 'both'; equality implies overlapping + # at a point when both sides of intervals are included + if self.closed == 'both': + return bool((self.right[:-1] < self.left[1:]).all() or + (self.left[:-1] > self.right[1:]).all()) + + # non-strict inequality when closed != 'both'; at least one side is + # not included in the intervals, so equality does not imply overlapping + return bool((self.right[:-1] <= self.left[1:]).all() or + (self.left[:-1] >= self.right[1:]).all()) + + # Conversion + @property + def values(self): + """ + Return the IntervalIndex's data as a numpy array of Interval + objects (with dtype='object') + """ + left = self.left + right = self.right + mask = self.isna() + closed = self._closed + + result = np.empty(len(left), dtype=object) + for i in range(len(left)): + if mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left[i], right[i], closed) + return result + + _interval_shared_docs['to_tuples'] = """\ + Return an %(return_type)s of tuples of the form (left, right) + + Parameters + ---------- + 
na_tuple : boolean, default True + Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA + value itself if False, ``nan``. + + ..versionadded:: 0.23.0 + + Returns + ------- + tuples: %(return_type)s + %(examples)s\ + """ + + @Appender(_interval_shared_docs['to_tuples'] % dict( + return_type='ndarray', + examples='', + )) + def to_tuples(self, na_tuple=True): + tuples = _asarray_tuplesafe(zip(self.left, self.right)) + if not na_tuple: + # GH 18756 + tuples = np.where(~self.isna(), tuples, np.nan) + return tuples + + def tolist(self): + """ + Return a list of Interval objects. + + See Also + -------- + numpy.ndarray.tolist + """ + # TODO: think about putting this in a parent + return self.values.tolist() + + def repeat(self, repeats): + """Repeat elements of an IntervalArray + + Parameters + ---------- + repeats : int + Number of repetitions for each element. + + Returns + ------- + IntervalArray + + See Also + -------- + numpy.repeat + """ + return self._simple_new( + self.left.repeat(repeats), + self.right.repeat(repeats), + closed=self.closed + ) + +# TODO: find a home + + +def maybe_convert_platform_interval(values): + """ + Try to do platform conversion, with special casing for IntervalIndex. + Wrapper around maybe_convert_platform that alters the default return + dtype in certain cases to be compatible with IntervalIndex. For example, + empty lists return with integer dtype instead of object dtype, which is + prohibited for IntervalIndex. 
+ + Parameters + ---------- + values : array-like + + Returns + ------- + array + """ + if isinstance(values, (list, tuple)) and len(values) == 0: + # GH 19016 + # empty lists/tuples get object dtype by default, but this is not + # prohibited for IntervalIndex, so coerce to integer instead + return np.array([], dtype=np.int64) + return maybe_convert_platform(values) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 708f54f5ca75b..51c25936b3a84 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -611,14 +611,13 @@ class IntervalDtypeType(type): pass -class IntervalDtype(PandasExtensionDtype): +class IntervalDtype(PandasExtensionDtype, ExtensionDtype): """ A Interval duck-typed class, suitable for holding an interval THIS IS NOT A REAL NUMPY DTYPE """ name = 'interval' - type = IntervalDtypeType kind = None str = '|O08' base = np.dtype('O') @@ -683,6 +682,11 @@ def construct_from_string(cls, string): msg = "a string needs to be passed, got type {typ}" raise TypeError(msg.format(typ=type(string))) + @property + def type(self): + from pandas import Interval + return Interval + def __unicode__(self): if self.subtype is None: return "interval" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 12bb09e8f8a8a..4170416f76c3a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -270,7 +270,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, **kwargs) # interval - if is_interval_dtype(data) or is_interval_dtype(dtype): + if ((is_interval_dtype(data) or is_interval_dtype(dtype)) and + not is_object_dtype(dtype)): from .interval import IntervalIndex closed = kwargs.get('closed', None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 54800d0d76d2e..c5ece41e0265c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,19 +1,16 @@ 
""" define the IntervalIndex """ +import textwrap +import warnings import numpy as np -import warnings -from pandas.core.dtypes.missing import notna, isna -from pandas.core.dtypes.generic import ABCDatetimeIndex, ABCPeriodIndex -from pandas.core.dtypes.dtypes import IntervalDtype -from pandas.core.dtypes.cast import maybe_convert_platform, find_common_type +from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, is_datetime_or_timedelta_dtype, is_datetime64tz_dtype, - is_categorical_dtype, - is_string_dtype, is_integer_dtype, is_float_dtype, is_interval_dtype, @@ -21,8 +18,7 @@ is_scalar, is_float, is_number, - is_integer, - pandas_dtype) + is_integer) from pandas.core.indexes.base import ( Index, _ensure_index, default_pprint, _index_shared_docs) @@ -30,26 +26,31 @@ from pandas._libs import Timestamp, Timedelta from pandas._libs.interval import ( Interval, IntervalMixin, IntervalTree, - intervals_to_interval_bounds) +) from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexes.multi import MultiIndex -from pandas.compat.numpy import function as nv import pandas.core.common as com from pandas.util._decorators import cache_readonly, Appender +from pandas.util.exceptions import rewrite_exception from pandas.core.config import get_option from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset import pandas.core.indexes.base as ibase +from pandas.core.arrays.interval import (IntervalArray, + _interval_shared_docs) + _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(klass='IntervalIndex', - target_klass='IntervalIndex or list of Intervals')) - - -_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) + target_klass='IntervalIndex or list of Intervals', + name=textwrap.dedent("""\ + name : object, optional 
+ to be stored in the index. + """), + )) def _get_next_label(label): @@ -95,30 +96,6 @@ def _get_interval_closed_bounds(interval): return left, right -def maybe_convert_platform_interval(values): - """ - Try to do platform conversion, with special casing for IntervalIndex. - Wrapper around maybe_convert_platform that alters the default return - dtype in certain cases to be compatible with IntervalIndex. For example, - empty lists return with integer dtype instead of object dtype, which is - prohibited for IntervalIndex. - - Parameters - ---------- - values : array-like - - Returns - ------- - array - """ - if isinstance(values, (list, tuple)) and len(values) == 0: - # GH 19016 - # empty lists/tuples get object dtype by default, but this is not - # prohibited for IntervalIndex, so coerce to integer instead - return np.array([], dtype=np.int64) - return maybe_convert_platform(values) - - def _new_IntervalIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have @@ -127,55 +104,16 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -class IntervalIndex(IntervalMixin, Index): - """ - Immutable Index implementing an ordered, sliceable set. IntervalIndex - represents an Index of Interval objects that are all closed on the same - side. - - .. versionadded:: 0.20.0 - - .. warning:: - - The indexing behaviors are provisional and may change in - a future version of pandas. - - Parameters - ---------- - data : array-like (1-dimensional) - Array-like containing Interval objects from which to build the - IntervalIndex - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both or - neither. - name : object, optional - Name to be stored in the index. 
- copy : boolean, default False - Copy the meta-data - dtype : dtype or None, default None - If None, dtype will be inferred - - ..versionadded:: 0.23.0 - - Attributes - ---------- - left - right - closed - mid - length - values - is_non_overlapping_monotonic - - Methods - ------- - from_arrays - from_tuples - from_breaks - contains +@Appender(_interval_shared_docs['class'] % dict( + klass="IntervalIndex", + summary="Immutable index of intervals that are closed on the same side.", + name=_index_doc_kwargs['name'], + versionadded="0.20.0", + extra_methods="contains\n", + examples=textwrap.dedent("""\ Examples - --------- + -------- A new ``IntervalIndex`` is typically constructed using :func:`interval_range`: @@ -189,161 +127,63 @@ class IntervalIndex(IntervalMixin, Index): See further examples in the doc strings of ``interval_range`` and the mentioned constructor methods. + """), - Notes - ------ - See the `user guide - `_ - for more. - - See Also - -------- - Index : The base pandas Index type - Interval : A bounded slice-like interval; the elements of an IntervalIndex - interval_range : Function to create a fixed frequency IntervalIndex - cut, qcut : Convert arrays of continuous data into Categoricals/Series of - Intervals - """ +)) +class IntervalIndex(IntervalMixin, Index): _typ = 'intervalindex' _comparables = ['name'] _attributes = ['name', 'closed'] + _allow_index_ops = True + _exception_rewrite = lambda: rewrite_exception('IntervalArray', + 'IntervalIndex') # we would like our indexing holder to defer to us _defer_to_indexing = True + # Immutable, so we are able to cache computations like isna in '_mask' _mask = None def __new__(cls, data, closed=None, dtype=None, copy=False, name=None, fastpath=False, verify_integrity=True): if fastpath: - return cls._simple_new(data.left, data.right, closed, name, - copy=copy, verify_integrity=False) + return cls._simple_new(data, name) if name is None and hasattr(data, 'name'): name = data.name - if isinstance(data, 
IntervalIndex): - left = data.left - right = data.right - closed = data.closed - else: - - # don't allow scalars - if is_scalar(data): - cls._scalar_data_error(data) + with cls._exception_rewrite(): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype, + fastpath=fastpath, + verify_integrity=verify_integrity) - data = maybe_convert_platform_interval(data) - left, right, infer_closed = intervals_to_interval_bounds(data) - - if (com._all_not_none(closed, infer_closed) and - closed != infer_closed): - # GH 18421 - msg = ("conflicting values for closed: constructor got " - "'{closed}', inferred from data '{infer_closed}'" - .format(closed=closed, infer_closed=infer_closed)) - raise ValueError(msg) - - closed = closed or infer_closed - - return cls._simple_new(left, right, closed, name, copy=copy, - dtype=dtype, verify_integrity=verify_integrity) + return cls._simple_new(array, name) @classmethod - def _simple_new(cls, left, right, closed=None, name=None, copy=False, - dtype=None, verify_integrity=True): - result = IntervalMixin.__new__(cls) + def _simple_new(cls, array, name, closed=None): + """Construct from an IntervalArray - closed = closed or 'right' - left = _ensure_index(left, copy=copy) - right = _ensure_index(right, copy=copy) - - if dtype is not None: - # GH 19262: dtype must be an IntervalDtype to override inferred - dtype = pandas_dtype(dtype) - if not is_interval_dtype(dtype): - msg = 'dtype must be an IntervalDtype, got {dtype}' - raise TypeError(msg.format(dtype=dtype)) - elif dtype.subtype is not None: - left = left.astype(dtype.subtype) - right = right.astype(dtype.subtype) - - # coerce dtypes to match if needed - if is_float_dtype(left) and is_integer_dtype(right): - right = right.astype(left.dtype) - elif is_float_dtype(right) and is_integer_dtype(left): - left = left.astype(right.dtype) - - if type(left) != type(right): - msg = ('must not have differing left [{ltype}] and right ' - '[{rtype}] types') - raise 
ValueError(msg.format(ltype=type(left).__name__, - rtype=type(right).__name__)) - elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): - # GH 19016 - msg = ('category, object, and string subtypes are not supported ' - 'for IntervalIndex') - raise TypeError(msg) - elif isinstance(left, ABCPeriodIndex): - msg = 'Period dtypes are not supported, use a PeriodIndex instead' - raise ValueError(msg) - elif (isinstance(left, ABCDatetimeIndex) and - str(left.tz) != str(right.tz)): - msg = ("left and right must have the same time zone, got " - "'{left_tz}' and '{right_tz}'") - raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) - - result._left = left - result._right = right - result._closed = closed + Parameters + ---------- + array : IntervalArray + name : str + Attached as result.name + closed : Any + Ignored. + """ + result = IntervalMixin.__new__(cls) + result._data = array result.name = name - if verify_integrity: - result._validate() result._reset_identity() return result @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, left=None, right=None, **kwargs): - if left is None: - - # no values passed - left, right = self.left, self.right - - elif right is None: - - # only single value passed, could be an IntervalIndex - # or array of Intervals - if not isinstance(left, IntervalIndex): - left = self._constructor(left) - - left, right = left.left, left.right - else: - - # both left and right are values - pass - + result = self._data._shallow_copy(left=left, right=right) attributes = self._get_attributes_dict() attributes.update(kwargs) - attributes['verify_integrity'] = False - return self._simple_new(left, right, **attributes) - - def _validate(self): - """ - Verify that the IntervalIndex is valid. 
- """ - if self.closed not in _VALID_CLOSED: - raise ValueError("invalid option for 'closed': {closed}" - .format(closed=self.closed)) - if len(self.left) != len(self.right): - raise ValueError('left and right must have the same length') - left_mask = notna(self.left) - right_mask = notna(self.right) - if not (left_mask == right_mask).all(): - raise ValueError('missing values must be missing in the same ' - 'location both left and right sides') - if not (self.left[left_mask] <= self.right[left_mask]).all(): - raise ValueError('left side of interval must be <= right side') - self._mask = ~left_mask + return self._simple_new(result, **attributes) @cache_readonly def hasnans(self): @@ -412,272 +252,58 @@ def contains(self, key): return False @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) def from_breaks(cls, breaks, closed='right', name=None, copy=False, dtype=None): - """ - Construct an IntervalIndex from an array of splits - - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - name : object, optional - Name to be stored in the index. 
- copy : boolean, default False - copy the data - dtype : dtype or None, default None - If None, dtype will be inferred - - ..versionadded:: 0.23.0 - - Examples - -------- - >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]] - closed='right', - dtype='interval[int64]') - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex - IntervalIndex.from_arrays : Construct an IntervalIndex from a left and - right array - IntervalIndex.from_tuples : Construct an IntervalIndex from a - list/array of tuples - """ - breaks = maybe_convert_platform_interval(breaks) - - return cls.from_arrays(breaks[:-1], breaks[1:], closed, - name=name, copy=copy, dtype=dtype) + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) def from_arrays(cls, left, right, closed='right', name=None, copy=False, dtype=None): - """ - Construct from two arrays defining the left and right bounds. - - Parameters - ---------- - left : array-like (1-dimensional) - Left bounds for each interval. - right : array-like (1-dimensional) - Right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - Copy the data. - dtype : dtype, optional - If None, dtype will be inferred. - - .. versionadded:: 0.23.0 - - Returns - ------- - index : IntervalIndex - - Notes - ----- - Each element of `left` must be less than or equal to the `right` - element at the same position. If an element is missing, it must be - missing in both `left` and `right`. A TypeError is raised when - using an unsupported type for `left` or `right`. 
At the moment, - 'category', 'object', and 'string' subtypes are not supported. - - Raises - ------ - ValueError - When a value is missing in only one of `left` or `right`. - When a value in `left` is greater than the corresponding value - in `right`. - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - IntervalIndex.from_breaks : Construct an IntervalIndex from an array of - splits. - IntervalIndex.from_tuples : Construct an IntervalIndex from a - list/array of tuples. - - Examples - -------- - >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]] - closed='right', - dtype='interval[int64]') - - If you want to segment different groups of people based on - ages, you can apply the method as follows: - - >>> ages = pd.IntervalIndex.from_arrays([0, 2, 13], - ... [2, 13, 19], closed='left') - >>> ages - IntervalIndex([[0, 2), [2, 13), [13, 19)] - closed='left', - dtype='interval[int64]') - >>> s = pd.Series(['baby', 'kid', 'teen'], ages) - >>> s - [0, 2) baby - [2, 13) kid - [13, 19) teen - dtype: object - - Values may be missing, but they must be missing in both arrays. - - >>> pd.IntervalIndex.from_arrays([0, np.nan, 13], - ... [2, np.nan, 19]) - IntervalIndex([(0.0, 2.0], nan, (13.0, 19.0]] - closed='right', - dtype='interval[float64]') - """ - left = maybe_convert_platform_interval(left) - right = maybe_convert_platform_interval(right) - - return cls._simple_new(left, right, closed, name=name, copy=copy, - dtype=dtype, verify_integrity=True) + array = IntervalArray.from_arrays(left, right, closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) @classmethod + @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) def from_intervals(cls, data, closed=None, name=None, copy=False, dtype=None): - """ - Construct an IntervalIndex from a 1d array of Interval objects - - .. 
deprecated:: 0.23.0 - - Parameters - ---------- - data : array-like (1-dimensional) - Array of Interval objects. All intervals must be closed on the same - sides. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - by-default copy the data, this is compat only and ignored - dtype : dtype or None, default None - If None, dtype will be inferred - - ..versionadded:: 0.23.0 - - Examples - -------- - >>> pd.IntervalIndex.from_intervals([pd.Interval(0, 1), - ... pd.Interval(1, 2)]) - IntervalIndex([(0, 1], (1, 2]] - closed='right', dtype='interval[int64]') - - The generic Index constructor work identically when it infers an array - of all intervals: - - >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)]) - IntervalIndex([(0, 1], (1, 2]] - closed='right', dtype='interval[int64]') - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex - IntervalIndex.from_arrays : Construct an IntervalIndex from a left and - right array - IntervalIndex.from_breaks : Construct an IntervalIndex from an array of - splits - IntervalIndex.from_tuples : Construct an IntervalIndex from a - list/array of tuples - """ msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; use IntervalIndex(...) instead') + 'removed in a future version; Use IntervalIndex(...) 
instead') warnings.warn(msg, FutureWarning, stacklevel=2) - return cls(data, closed=closed, name=name, copy=copy, dtype=dtype) + with cls._exception_rewrite(): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) + + if name is None and isinstance(data, cls): + name = data.name + + return cls._simple_new(array, name=name) @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) def from_tuples(cls, data, closed='right', name=None, copy=False, dtype=None): - """ - Construct an IntervalIndex from a list/array of tuples - - Parameters - ---------- - data : array-like (1-dimensional) - Array of tuples - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - by-default copy the data, this is compat only and ignored - dtype : dtype or None, default None - If None, dtype will be inferred - - ..versionadded:: 0.23.0 - - Examples - -------- - >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)]) - IntervalIndex([(0, 1], (1, 2]], - closed='right', dtype='interval[int64]') - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex - IntervalIndex.from_arrays : Construct an IntervalIndex from a left and - right array - IntervalIndex.from_breaks : Construct an IntervalIndex from an array of - splits - """ - if len(data): - left, right = [], [] - else: - left = right = data - - for d in data: - if isna(d): - lhs = rhs = np.nan - else: - try: - # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] 
- lhs, rhs = d - except ValueError: - msg = ('IntervalIndex.from_tuples requires tuples of ' - 'length 2, got {tpl}').format(tpl=d) - raise ValueError(msg) - except TypeError: - msg = ('IntervalIndex.from_tuples received an invalid ' - 'item, {tpl}').format(tpl=d) - raise TypeError(msg) - left.append(lhs) - right.append(rhs) - - return cls.from_arrays(left, right, closed, name=name, copy=False, - dtype=dtype) - - def to_tuples(self, na_tuple=True): - """ - Return an Index of tuples of the form (left, right) - - Parameters - ---------- - na_tuple : boolean, default True - Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA - value itself if False, ``nan``. - - ..versionadded:: 0.23.0 - + with cls._exception_rewrite(): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(arr, name=name) + + @Appender(_interval_shared_docs['to_tuples'] % dict( + return_type="Index", + examples=""" Examples -------- >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) >>> idx.to_tuples() Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') - """ - tuples = com._asarray_tuplesafe(zip(self.left, self.right)) - if not na_tuple: - # GH 18756 - tuples = np.where(~self._isnan, tuples, np.nan) + Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", + )) + def to_tuples(self, na_tuple=True): + tuples = self._data.to_tuples(na_tuple=na_tuple) return Index(tuples) @cache_readonly @@ -691,7 +317,7 @@ def left(self): Return the left endpoints of each Interval in the IntervalIndex as an Index """ - return self._left + return self._data._left @property def right(self): @@ -699,7 +325,7 @@ def right(self): Return the right endpoints of each Interval in the IntervalIndex as an Index """ - return self._right + return self._data._right @property def closed(self): @@ -707,7 +333,7 @@ def closed(self): Whether the intervals 
are closed on the left-side, right-side, both or neither """ - return self._closed + return self._data._closed @property def length(self): @@ -715,23 +341,22 @@ def length(self): Return an Index with entries denoting the length of each Interval in the IntervalIndex """ - try: - return self.right - self.left - except TypeError: - # length not defined for some types, e.g. string - msg = ('IntervalIndex contains Intervals without defined length, ' - 'e.g. Intervals with string endpoints') - raise TypeError(msg) + return self._data.length @property def size(self): - # Avoid materializing self.values - return self.left.size + # Avoid materializing ndarray[Interval] + return self._data.size @property def shape(self): - # Avoid materializing self.values - return self.left.shape + # Avoid materializing ndarray[Interval] + return self._data.shape + + @property + def itemsize(self): + # Avoid materializing ndarray[Interval] + return self._data.itemsize def __len__(self): return len(self.left) @@ -739,13 +364,20 @@ def __len__(self): @cache_readonly def values(self): """ - Return the IntervalIndex's data as a numpy array of Interval - objects (with dtype='object') + Return the IntervalIndex's data as an IntervalArray. 
""" + return self._data + + @cache_readonly + def _values(self): + return self._data + + @cache_readonly + def _ndarray_values(self): left = self.left right = self.right mask = self._isnan - closed = self._closed + closed = self.closed result = np.empty(len(left), dtype=object) for i in range(len(left)): @@ -757,15 +389,12 @@ def values(self): def __array__(self, result=None): """ the array interface, return my values """ - return self.values + return self._ndarray_values def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result - def _array_values(self): - return self.values - def __reduce__(self): d = dict(left=self.left, right=self.right) @@ -774,30 +403,24 @@ def __reduce__(self): @Appender(_index_shared_docs['copy']) def copy(self, deep=False, name=None): - left = self.left.copy(deep=True) if deep else self.left - right = self.right.copy(deep=True) if deep else self.right - name = name if name is not None else self.name - closed = self.closed - return type(self).from_arrays(left, right, closed=closed, name=name) + array = self._data.copy(deep=deep) + attributes = self._get_attributes_dict() + if name is not None: + attributes.update(name=name) + + return self._simple_new(array, **attributes) @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - dtype = pandas_dtype(dtype) - if is_interval_dtype(dtype) and dtype != self.dtype: - try: - new_left = self.left.astype(dtype.subtype) - new_right = self.right.astype(dtype.subtype) - except TypeError: - msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' - 'incompatible') - raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) - return self._shallow_copy(new_left, new_right) + new_values = self.values.astype(dtype, copy=copy) + if is_interval_dtype(new_values): + return self._shallow_copy(new_values.left, new_values.right) return super(IntervalIndex, self).astype(dtype, copy=copy) @cache_readonly def dtype(self): """Return the 
dtype object of the underlying data""" - return IntervalDtype.construct_from_string(str(self.left.dtype)) + return self._data.dtype @property def inferred_type(self): @@ -816,11 +439,7 @@ def mid(self): """ Return the midpoint of each Interval in the IntervalIndex as an Index """ - try: - return 0.5 * (self.left + self.right) - except TypeError: - # datetime safe version - return self.left + 0.5 * self.length + return self._data.mid @cache_readonly def is_monotonic(self): @@ -855,25 +474,7 @@ def is_unique(self): @cache_readonly def is_non_overlapping_monotonic(self): - """ - Return True if the IntervalIndex is non-overlapping (no Intervals share - points) and is either monotonic increasing or monotonic decreasing, - else False - """ - # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) - # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) - # we already require left <= right - - # strict inequality for closed == 'both'; equality implies overlapping - # at a point when both sides of intervals are included - if self.closed == 'both': - return bool((self.right[:-1] < self.left[1:]).all() or - (self.left[:-1] > self.right[1:]).all()) - - # non-strict inequality when closed != 'both'; at least one side is - # not included in the intervals, so equality does not imply overlapping - return bool((self.right[:-1] <= self.left[1:]).all() or - (self.left[:-1] >= self.right[1:]).all()) + return self._data.is_non_overlapping_monotonic @Appender(_index_shared_docs['_convert_scalar_indexer']) def _convert_scalar_indexer(self, key, kind=None): @@ -1261,33 +862,10 @@ def _concat_same_dtype(self, to_concat, name): @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) - indices = _ensure_platform_int(indices) - left, right = self.left, self.right - - if fill_value is None: - fill_value = self._na_value - mask = indices == -1 - - if not mask.any(): - # we 
won't change dtype here in this case - # if we don't need - allow_fill = False - - taker = lambda x: x.take(indices, allow_fill=allow_fill, - fill_value=fill_value) - - try: - new_left = taker(left) - new_right = taker(right) - except ValueError: - - # we need to coerce; migth have NA's in an - # integer dtype - new_left = taker(left.astype(float)) - new_right = taker(right.astype(float)) - - return self._shallow_copy(new_left, new_right) + result = self._data.take(indices, axis=axis, allow_fill=allow_fill, + fill_value=fill_value, **kwargs) + attributes = self._get_attributes_dict() + return self._simple_new(result, **attributes) def __getitem__(self, value): mask = self._isnan[value] @@ -1347,7 +925,7 @@ def _format_data(self, name=None): tail = [formatter(x) for x in self] summary = '[{tail}]'.format(tail=', '.join(tail)) - return summary + self._format_space() + return summary + ',' + self._format_space() def _format_attrs(self): attrs = [('closed', repr(self.closed))] diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 7edb5b16ce77a..a161413594f95 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -205,7 +205,9 @@ def _hash_categorical(c, encoding, hash_key): ------- ndarray of hashed values array, same size as len(c) """ - hashed = hash_array(c.categories.values, encoding, hash_key, + # Convert ExtensionArrays to ndarrays + values = np.asarray(c.categories.values) + hashed = hash_array(values, encoding, hash_key, categorize=False) # we have uint64, as we don't directly support missing values diff --git a/pandas/io/packers.py b/pandas/io/packers.py index f9b1d1574d45c..7002da81cbf1b 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -57,6 +57,7 @@ Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, Categorical, CategoricalIndex, IntervalIndex, Interval, TimedeltaIndex) +from pandas.core.arrays import IntervalArray from pandas.core.sparse.api import SparseSeries, SparseDataFrame from 
pandas.core.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame @@ -402,13 +403,17 @@ def encode(obj): u'freq': u_safe(getattr(obj, 'freqstr', None)), u'tz': tz, u'compress': compressor} - elif isinstance(obj, IntervalIndex): - return {u'typ': u'interval_index', + elif isinstance(obj, (IntervalIndex, IntervalArray)): + if isinstance(obj, IntervalIndex): + typ = u'interval_index' + else: + typ = u'interval_array' + return {u'typ': typ, u'klass': u(obj.__class__.__name__), u'name': getattr(obj, 'name', None), - u'left': getattr(obj, '_left', None), - u'right': getattr(obj, '_right', None), - u'closed': getattr(obj, '_closed', None)} + u'left': getattr(obj, 'left', None), + u'right': getattr(obj, 'right', None), + u'closed': getattr(obj, 'closed', None)} elif isinstance(obj, MultiIndex): return {u'typ': u'multi_index', u'klass': u(obj.__class__.__name__), @@ -610,7 +615,7 @@ def decode(obj): result = result.tz_localize('UTC').tz_convert(tz) return result - elif typ == u'interval_index': + elif typ in (u'interval_index', 'interval_array'): return globals()[obj[u'klass']].from_arrays(obj[u'left'], obj[u'right'], obj[u'closed'], diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index cc833af03ae66..dac4b1117c05f 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -546,10 +546,8 @@ def test_basic(self): s = Series(ii, name='A') - # dtypes - # series results in object dtype currently, - assert not is_interval_dtype(s.dtype) - assert not is_interval_dtype(s) + assert is_interval_dtype(s.dtype) + assert is_interval_dtype(s) def test_basic_dtype(self): assert is_interval_dtype('interval[int64]') diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 589134632c7e9..44b818be84e31 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -70,7 +70,6 @@ def test_astype_no_copy(): 
@pytest.mark.parametrize('dtype', [ dtypes.DatetimeTZDtype('ns', 'US/Central'), dtypes.PeriodDtype("D"), - dtypes.IntervalDtype(), ]) def test_is_not_extension_array_dtype(dtype): assert not isinstance(dtype, dtypes.ExtensionDtype) @@ -79,6 +78,7 @@ def test_is_not_extension_array_dtype(dtype): @pytest.mark.parametrize('dtype', [ dtypes.CategoricalDtype(), + dtypes.IntervalDtype(), ]) def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py new file mode 100644 index 0000000000000..a0893b3fa0d44 --- /dev/null +++ b/pandas/tests/extension/test_interval.py @@ -0,0 +1,122 @@ +import pytest +import numpy as np + +from pandas import Interval +from pandas.core.arrays import IntervalArray +from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.tests.extension import base +import pandas.util.testing as tm + + +def make_data(): + N = 100 + left = np.random.uniform(size=N).cumsum() + right = left + np.random.uniform(size=N) + return [Interval(l, r) for l, r in zip(left, right)] + + +@pytest.fixture +def dtype(): + return IntervalDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + return IntervalArray(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return IntervalArray.from_tuples([None, (0, 1)]) + + +@pytest.fixture +def data_for_sorting(): + return IntervalArray.from_tuples([(1, 2), (2, 3), (0, 1)]) + + +@pytest.fixture +def data_missing_for_sorting(): + return IntervalArray.from_tuples([(1, 2), None, (0, 1)]) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(): + a = (0, 1) + b = (1, 2) + c = (2, 3) + return IntervalArray.from_tuples([b, b, None, None, a, a, b, c]) + + +class BaseInterval(object): + pass + + +class TestDtype(BaseInterval, base.BaseDtypeTests): + pass + + +class 
TestCasting(BaseInterval, base.BaseCastingTests): + pass + + +class TestConstructors(BaseInterval, base.BaseConstructorsTests): + pass + + +class TestGetitem(BaseInterval, base.BaseGetitemTests): + pass + + +class TestGrouping(BaseInterval, base.BaseGroupbyTests): + pass + + +class TestInterface(BaseInterval, base.BaseInterfaceTests): + pass + + +class TestMethods(BaseInterval, base.BaseMethodsTests): + pass + + +class TestMissing(BaseInterval, base.BaseMissingTests): + # Index.fillna only accepts scalar `value`, so we have to skip all + # non-scalar fill tests. + unsupported_fill = pytest.mark.skip("Unsupported fillna option.") + + @unsupported_fill + def test_fillna_limit_pad(self): + pass + + @unsupported_fill + def test_fillna_series_method(self): + pass + + @unsupported_fill + def test_fillna_limit_backfill(self): + pass + + @unsupported_fill + def test_fillna_series(self): + pass + + def test_non_scalar_raises(self, data_missing): + msg = "Got a 'list' instead of a 'pandas.Interval'." 
+ with tm.assert_raises_regex(TypeError, msg): + data_missing.fillna([1, 1]) + + +class TestReshaping(BaseInterval, base.BaseReshapingTests): + pass + + +def test_repr(): + idx = pd.interval_range(0, 4) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 758f3f0ef9ebc..b2788e4f38b90 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -886,7 +886,7 @@ def test_hasnans_isnans(self): assert not idx.hasnans idx = index.copy() - values = idx.values + values = np.asarray(idx.values) if len(index) == 0: continue @@ -928,7 +928,7 @@ def test_fillna(self): idx.fillna([idx[0]]) idx = index.copy() - values = idx.values + values = np.asarray(idx.values) if isinstance(index, DatetimeIndexOpsMixin): values[1] = iNaT diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index b3a4bfa878c3f..b706282ed1dac 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -51,7 +51,7 @@ def test_astype_category(self, index): 'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[ns]', 'datetime64[ns, US/Eastern]']) def test_astype_cannot_cast(self, index, dtype): - msg = 'Cannot cast IntervalIndex to dtype' + msg = 'Cannot cast IntervalArray to dtype' with tm.assert_raises_regex(TypeError, msg): index.astype(dtype) diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 5fdf92dcb2044..a1aa5c896663f 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -78,7 +78,7 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result.values, expected_values) + tm.assert_numpy_array_equal(result._ndarray_values, expected_values) @pytest.mark.parametrize('breaks', [ [], @@ -97,7 
+97,7 @@ def test_constructor_empty(self, constructor, breaks, closed): assert result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result.values, expected_values) + tm.assert_numpy_array_equal(result._ndarray_values, expected_values) @pytest.mark.parametrize('breaks', [ tuple('0123456789'), diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 71a6f78125004..a441d206e5e72 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -55,7 +55,6 @@ def test_properties(self, closed): ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))] expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) - tm.assert_numpy_array_equal(index.values, expected) # with nans index = self.create_index_with_nan(closed=closed) @@ -76,7 +75,6 @@ def test_properties(self, closed): for l, r in zip(expected_left, expected_right)] expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) - tm.assert_numpy_array_equal(index.values, expected) @pytest.mark.parametrize('breaks', [ [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608], @@ -141,7 +139,7 @@ def test_ensure_copied_data(self, closed): check_same='same') # by-definition make a copy - result = IntervalIndex(index.values, copy=False) + result = IntervalIndex(index._ndarray_values, copy=False) tm.assert_numpy_array_equal(index.left.values, result.left.values, check_same='copy') tm.assert_numpy_array_equal(index.right.values, result.right.values, @@ -964,3 +962,21 @@ def test_to_tuples_na(self, tuples, na_tuple): assert all(isna(x) for x in result_na) else: assert isna(result_na) + + def test_nbytes(self): + # GH 19209 + left = np.arange(0, 4, dtype='i8') + right = np.arange(1, 5, dtype='i8') + + result = IntervalIndex.from_arrays(left, right).nbytes + expected = 64 
# 4 * 8 * 2 + assert result == expected + + def test_itemsize(self): + # GH 19209 + left = np.arange(0, 4, dtype='i8') + right = np.arange(1, 5, dtype='i8') + + result = IntervalIndex.from_arrays(left, right).itemsize + expected = 16 # 8 * 2 + assert result == expected diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index c4c02c0bf6f17..888d06b67effe 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1194,7 +1194,7 @@ def test_iter_box(self): 'datetime64[ns, US/Central]'), (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), - (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray, 'interval'), ]) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values @@ -1208,6 +1208,8 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_index_equal(l_values, r_values) elif pd.api.types.is_categorical(l_values): tm.assert_categorical_equal(l_values, r_values) + elif pd.api.types.is_interval_dtype(l_values): + tm.assert_interval_array_equal(l_values, r_values) else: raise TypeError("Unexpected type {}".format(type(l_values))) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 1c878604b11a2..86b123a862a0c 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import textwrap import pandas as pd import pytest import numpy as np @@ -717,6 +718,21 @@ def test_categorical_equal_message(self): tm.assert_categorical_equal(a, b) +class TestAssertIntervalArrayEqual(object): + def test_interval_array_equal_message(self): + a = pd.interval_range(0, periods=4).values + b = pd.interval_range(1, periods=4).values + + msg = textwrap.dedent("""\ + IntervalArray.left are different + + IntervalArray.left values are different \\(100.0 %\\) + 
\\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\) + \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""") + with tm.assert_raises_regex(AssertionError, msg): + tm.assert_interval_array_equal(a, b) + + class TestRNGContext(object): def test_RNGContext(self): diff --git a/pandas/util/exceptions.py b/pandas/util/exceptions.py new file mode 100644 index 0000000000000..953c8a43a21b8 --- /dev/null +++ b/pandas/util/exceptions.py @@ -0,0 +1,16 @@ +import contextlib + + +@contextlib.contextmanager +def rewrite_exception(old_name, new_name): + """Rewrite the message of an exception.""" + try: + yield + except Exception as e: + msg = e.args[0] + msg = msg.replace(old_name, new_name) + args = (msg,) + if len(e.args) > 1: + args = args + e.args[1:] + e.args = args + raise diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 6e13a17eba68c..45bb091238a63 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -20,7 +20,7 @@ import numpy as np import pandas as pd -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, IntervalArray from pandas.core.dtypes.missing import array_equivalent from pandas.core.dtypes.common import ( is_datetimelike_v_numeric, @@ -849,7 +849,7 @@ def _get_ilevel_values(index, level): assert_attr_equal('freq', left, right, obj=obj) if (isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex)): - assert_attr_equal('closed', left, right, obj=obj) + assert_interval_array_equal(left.values, right.values) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): @@ -987,6 +987,31 @@ def assert_categorical_equal(left, right, check_dtype=True, assert_attr_equal('ordered', left, right, obj=obj) +def assert_interval_array_equal(left, right, exact='equiv', + obj='IntervalArray'): + """Test that two IntervalArrays are equivalent. + + Parameters + ---------- + left, right : IntervalArray + The IntervalArrays to compare. 
+ exact : bool / string {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well. + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, IntervalArray) + + assert_index_equal(left.left, right.left, exact=exact, + obj='{obj}.left'.format(obj=obj)) + assert_index_equal(left.right, right.right, exact=exact, + obj='{obj}.left'.format(obj=obj)) + assert_attr_equal('closed', left, right, obj=obj) + + def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): left = pprint_thing(left) @@ -1206,11 +1231,7 @@ def assert_series_equal(left, right, check_dtype=True, assert_numpy_array_equal(left.get_values(), right.get_values(), check_dtype=check_dtype) elif is_interval_dtype(left) or is_interval_dtype(right): - # TODO: big hack here - left = pd.IntervalIndex(left) - right = pd.IntervalIndex(right) - assert_index_equal(left, right, obj='{obj}.index'.format(obj=obj)) - + assert_interval_array_equal(left.values, right.values) else: _testing.assert_almost_equal(left.get_values(), right.get_values(), check_less_precise=check_less_precise, From 9e5fc500d0fb7abd3e270771b138cce905923430 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Apr 2018 12:06:10 -0500 Subject: [PATCH 02/22] fixup! 
REF: IntervalIndex[IntervalArray] --- pandas/core/indexes/interval.py | 3 ++- pandas/tests/indexes/interval/test_astype.py | 2 +- pandas/tests/test_base.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c5ece41e0265c..6e13cc29638f6 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -412,7 +412,8 @@ def copy(self, deep=False, name=None): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - new_values = self.values.astype(dtype, copy=copy) + with rewrite_exception('IntervalArray', self.__class__.__name__): + new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) return super(IntervalIndex, self).astype(dtype, copy=copy) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index b706282ed1dac..b3a4bfa878c3f 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -51,7 +51,7 @@ def test_astype_category(self, index): 'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[ns]', 'datetime64[ns, US/Eastern]']) def test_astype_cannot_cast(self, index, dtype): - msg = 'Cannot cast IntervalArray to dtype' + msg = 'Cannot cast IntervalIndex to dtype' with tm.assert_raises_regex(TypeError, msg): index.astype(dtype) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 888d06b67effe..191b716cd7fff 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1194,7 +1194,8 @@ def test_iter_box(self): 'datetime64[ns, US/Central]'), (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), - (pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray, 'interval'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), 
pd.core.arrays.IntervalArray, + 'interval'), ]) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values From abb8a451edfb31518d88020c042abfeb9e7a986f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Apr 2018 13:10:25 -0500 Subject: [PATCH 03/22] fixup! fixup! REF: IntervalIndex[IntervalArray] --- pandas/core/arrays/interval.py | 4 ---- pandas/tests/extension/test_interval.py | 9 ++++++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index b793eb5a59aae..32a89c4b07ee3 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -704,10 +704,6 @@ def size(self): def shape(self): return self.left.shape - @property - def itemsize(self): - return self.left.itemsize + self.right.itemsize - @property def is_non_overlapping_monotonic(self): """ diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index a0893b3fa0d44..6a1e76b10205e 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -1,7 +1,7 @@ import pytest import numpy as np -from pandas import Interval +from pandas import Interval, IntervalIndex from pandas.core.arrays import IntervalArray from pandas.core.dtypes.dtypes import IntervalDtype from pandas.tests.extension import base @@ -118,5 +118,8 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): pass -def test_repr(): - idx = pd.interval_range(0, 4) +def test_repr_matches(): + idx = IntervalIndex.from_breaks([1, 2, 3]) + a = repr(idx) + b = repr(idx.values) + assert a.replace("Index", "Array") == b From 4e48e8879a9a68095ff3c6ea79b2dd40de8bca8c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Apr 2018 13:40:55 -0500 Subject: [PATCH 04/22] Py2 compat Writable docstrings. Removed lambda as class attribute. 
--- pandas/compat/__init__.py | 8 ++++++++ pandas/core/arrays/interval.py | 2 ++ pandas/core/indexes/interval.py | 10 +++++----- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 12517372fedd1..eccb7ab2ad51f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -451,3 +451,11 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 + + +class _WritableDoc(type): + # Remove this when Python2 support is dropped + # __doc__ is not mutable for new-style classes in Python2, which means + # we can't use @Appender to share class docstrings. This can be used + # with `add_metaclass` to make cls.__doc__ mutable. + pass diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 32a89c4b07ee3..cbe862b0a878e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -3,6 +3,7 @@ from pandas._libs.interval import (Interval, IntervalMixin, intervals_to_interval_bounds) +from pandas.compat import add_metaclass, _WritableDoc from pandas.compat.numpy import function as nv from pandas.core.common import _all_not_none, _asarray_tuplesafe from pandas.core.config import get_option @@ -95,6 +96,7 @@ versionadded="0.23.0", name='', extra_methods='', examples='', )) +@add_metaclass(_WritableDoc) class IntervalArray(IntervalMixin, ExtensionArray): dtype = IntervalDtype() ndim = 1 diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6e13cc29638f6..738e3d896d477 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -4,6 +4,7 @@ import numpy as np +from pandas.compat import add_metaclass, _WritableDoc from pandas.core.dtypes.missing import isna from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( @@ -130,13 +131,12 @@ def _new_IntervalIndex(cls, d): """), )) +@add_metaclass(_WritableDoc) class 
IntervalIndex(IntervalMixin, Index): _typ = 'intervalindex' _comparables = ['name'] _attributes = ['name', 'closed'] _allow_index_ops = True - _exception_rewrite = lambda: rewrite_exception('IntervalArray', - 'IntervalIndex') # we would like our indexing holder to defer to us _defer_to_indexing = True @@ -153,7 +153,7 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, if name is None and hasattr(data, 'name'): name = data.name - with cls._exception_rewrite(): + with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype, fastpath=fastpath, verify_integrity=verify_integrity) @@ -274,7 +274,7 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, msg = ('IntervalIndex.from_intervals is deprecated and will be ' 'removed in a future version; Use IntervalIndex(...) instead') warnings.warn(msg, FutureWarning, stacklevel=2) - with cls._exception_rewrite(): + with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) if name is None and isinstance(data, cls): @@ -286,7 +286,7 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) def from_tuples(cls, data, closed='right', name=None, copy=False, dtype=None): - with cls._exception_rewrite(): + with rewrite_exception("IntervalArray", cls.__name__): arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) return cls._simple_new(arr, name=name) From 11d97dbf1405ae5338d4f6e4676716541239a756 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Apr 2018 13:51:46 -0500 Subject: [PATCH 05/22] DOC: Added note about missing comma --- doc/source/whatsnew/v0.23.0.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3f04743c75347..93d3618fac7fc 100644 --- a/doc/source/whatsnew/v0.23.0.txt 
+++ b/doc/source/whatsnew/v0.23.0.txt @@ -305,7 +305,7 @@ Storing Interval Data in Series and DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Interval data may now be stored in a Series or DataFrame, in addition to an -:class:`IntervalIndex` like before. +:class:`IntervalIndex` like before (:issue:`19453`). .. ipython:: python @@ -520,7 +520,8 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``.values`` attribute of an :class:`IntervalIndex` now returns an -``IntervalArray``, rather than a NumPy array of :class:`Interval` objects. +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects +(:issue:`19453`). Previous Behavior: @@ -1133,6 +1134,7 @@ Indexing - Bug in ``Index`` subclasses constructors that ignore unexpected keyword arguments (:issue:`19348`) - Bug in :meth:`Index.difference` when taking difference of an ``Index`` with itself (:issue:`20040`) - Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` in presence of entire rows of NaNs in the middle of values (:issue:`20499`). 
+- Bug in the ``IntervalIndex`` repr missing a trailing comma at the end of the "data" section (:issue:`20611`) MultiIndex ^^^^^^^^^^ From de96a61c4d46cc7ca7e4249bd4a0e63bc68b549b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Apr 2018 15:29:18 -0500 Subject: [PATCH 06/22] Added IntervalArray.__setitem__ --- pandas/core/arrays/interval.py | 20 ++++++++++ pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/setitem.py | 38 +++++++++++++++++++ .../extension/category/test_categorical.py | 4 ++ .../tests/extension/decimal/test_decimal.py | 4 ++ pandas/tests/extension/test_interval.py | 4 ++ 6 files changed, 71 insertions(+) create mode 100644 pandas/tests/extension/base/setitem.py diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cbe862b0a878e..fbaf05a8ce1fd 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -456,6 +456,26 @@ def __getitem__(self, value): return self._shallow_copy(left, right) + def __setitem__(self, key, value): + if not (is_interval_dtype(value) or isinstance(value, Interval)): + msg = "'value' should be an interval type, got {} instead." + raise TypeError(msg.format(type(value))) + + if value.closed != self.closed: + msg = "'value.closed' ({}) does not match {}." + raise ValueError(value.closed, self.closed) + + # Need to ensure that left and right are updated atomically, so we're + # forced to copy, update the copy, and swap in the new values. 
+ left = self.left.copy(deep=True) + right = self.right.copy(deep=True) + + left.values[key] = value.left + right.values[key] = value.right + + self._left = left + self._right = right + def fillna(self, value=None, method=None, limit=None): if method is not None: raise TypeError('Filling by method is not supported for ' diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index f8078d2798b32..9da985625c4ee 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -49,3 +49,4 @@ class TestMyDtype(BaseDtypeTests): from .methods import BaseMethodsTests # noqa from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa +from .setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py new file mode 100644 index 0000000000000..0d183177ce75e --- /dev/null +++ b/pandas/tests/extension/base/setitem.py @@ -0,0 +1,38 @@ +import pytest + +from .base import BaseExtensionTests + + +class BaseSetitemTests(BaseExtensionTests): + """Tests for ExtensionArray.__setitem__""" + + def test_set_scalar(self, data): + expected = data.take([1, 1]) + subset = data[:2].copy() + + subset[0] = data[1] + self.assert_extension_array_equal(subset, expected) + + def test_set_mask_scalar(self, data): + expected = data.take([1, 1, 2, 1]) + subset = data[:4].copy() + + subset[[True, True, False, True]] = data[1] + self.assert_extension_array_equal(subset, expected) + + @pytest.mark.parametrize('key', [ + [False, True, True, True], + [1, 2, 3], + ], ids=['mask', 'fancy']) + def test_set_array(self, key, data): + expected = data.take([0, 2, 2, 1]) + value = data.take([2, 2, 1]) + subset = data[:4].copy() + + subset[key] = value + self.assert_extension_array_equal(subset, expected) + + def test_bad_mask_bad_length_raise(self, data): + value = data[0] + with pytest.raises(IndexError): + data[[True, False]] = value 
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 7528299578326..32b44c943317b 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -106,3 +106,7 @@ def test_value_counts(self, all_data, dropna): class TestCasting(base.BaseCastingTests): pass + + +class TestSetitem(base.BaseSetitemTests): + pass diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index d509170565e1a..5c3536ecd8843 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -131,6 +131,10 @@ class TestGroupby(BaseDecimal, base.BaseGroupbyTests): pass +class TestSetitem(BaseDecimal, base.BaseSetitemTests): + pass + + def test_series_constructor_coerce_data_to_extension_dtype_raises(): xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the " "extension array directly.") diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 6a1e76b10205e..c10cf0490f69b 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -118,6 +118,10 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): pass +class TestSetitem(BaseInterval, base.BaseSetitemTests): + pass + + def test_repr_matches(): idx = IntervalIndex.from_breaks([1, 2, 3]) a = repr(idx) From 24db2227e54d9be4b046a0ddb8172cbb7504ff44 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Apr 2018 16:59:13 -0500 Subject: [PATCH 07/22] Fixed linting --- pandas/core/arrays/interval.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index fbaf05a8ce1fd..c819c6efa2bb8 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -5,7 +5,7 @@ intervals_to_interval_bounds) from pandas.compat 
import add_metaclass, _WritableDoc from pandas.compat.numpy import function as nv -from pandas.core.common import _all_not_none, _asarray_tuplesafe +import pandas.core.common as com from pandas.core.config import get_option from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import (_ensure_platform_int, @@ -130,7 +130,8 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, data = maybe_convert_platform_interval(data) left, right, infer_closed = intervals_to_interval_bounds(data) - if _all_not_none(closed, infer_closed) and closed != infer_closed: + if (com._all_not_none(closed, infer_closed) and + closed != infer_closed): # GH 18421 msg = ("conflicting values for closed: constructor got " "'{closed}', inferred from data '{infer_closed}'" @@ -790,7 +791,7 @@ def values(self): examples='', )) def to_tuples(self, na_tuple=True): - tuples = _asarray_tuplesafe(zip(self.left, self.right)) + tuples = com._asarray_tuplesafe(zip(self.left, self.right)) if not na_tuple: # GH 18756 tuples = np.where(~self.isna(), tuples, np.nan) From d2bb35a7941f997bc0da07f00da42a22b209abef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 5 Apr 2018 14:41:10 -0500 Subject: [PATCH 08/22] Fix for Joris --- pandas/core/arrays/interval.py | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c819c6efa2bb8..287ad727d57e9 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -438,7 +438,7 @@ def _validate(self): # Interface # --------- def __iter__(self): - return iter(self.values) + return iter(np.asarray(self)) def __len__(self): return len(self.left) @@ -517,10 +517,10 @@ def astype(self, dtype, copy=True): else: return self elif is_categorical_dtype(dtype): - return Categorical(self.values) + return Categorical(np.asarray(self)) # TODO: This try/except will be repeated. 
try: - return self.values.astype(dtype, copy=copy) + return np.asarray(self).astype(dtype, copy=copy) except (TypeError, ValueError): msg = 'Cannot cast {name} to dtype {dtype}' raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) @@ -571,10 +571,7 @@ def copy(self, deep=False): return type(self).from_arrays(left, right, closed=closed) def _formatting_values(self): - return self.values - - def get_values(self): - return self.values + return np.asarray(self) def isna(self): return isna(self.left) @@ -617,8 +614,6 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, return self._shallow_copy(new_left, new_right) - take_nd = take - # Formatting def _format_data(self): @@ -750,8 +745,7 @@ def is_non_overlapping_monotonic(self): (self.left[:-1] >= self.right[1:]).all()) # Conversion - @property - def values(self): + def __array__(self, dtype=None): """ Return the IntervalIndex's data as a numpy array of Interval objects (with dtype='object') @@ -797,17 +791,6 @@ def to_tuples(self, na_tuple=True): tuples = np.where(~self.isna(), tuples, np.nan) return tuples - def tolist(self): - """ - Return a list of Interval objects. 
- - See Also - -------- - numpy.ndarray.tolist - """ - # TODO: think about putting this in a parent - return self.values.tolist() - def repeat(self, repeats): """Repeat elements of an IntervalArray From f22b453b80588330aa24c6dd77e9ad334fda930d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Apr 2018 07:23:26 -0500 Subject: [PATCH 09/22] Update for comments --- doc/source/whatsnew/v0.23.0.txt | 6 ++-- pandas/_libs/interval.pyx | 20 +++++++++++ pandas/core/arrays/categorical.py | 3 ++ pandas/core/arrays/interval.py | 44 +++++-------------------- pandas/core/indexes/interval.py | 1 - pandas/tests/extension/test_interval.py | 2 +- 6 files changed, 36 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 93d3618fac7fc..e3df2e9309f11 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -299,7 +299,7 @@ Supplying a ``CategoricalDtype`` will make the categories in each column consist df['A'].dtype df['B'].dtype -.. _whatsnew_023.enhancements.interval: +.. _whatsnew_0230.enhancements.interval: Storing Interval Data in Series and DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -334,7 +334,7 @@ To recover the NumPy array of Interval objects, use :func:`numpy.asarray`: This is the same behavior as ``Series.values`` for categorical data. See :ref:`whatsnew_0230.api_breaking.interval_values` for more. -.. _whatsnew_023.enhancements.extension: +.. _whatsnew_0230.enhancements.extension: Extending Pandas with Custom Types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -542,7 +542,7 @@ New Behavior: idx = pd.interval_range(0, 4) idx.values -This mirrors ``CateogricalIndex.values``, which returns a ``Categorical``. +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. For situations where you need an ``ndarray`` of Interval objects, use :meth:`numpy.asarray` or ``idx.astype(object)``. 
diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 5dbf509fda65e..9e0e358a93c2f 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -98,6 +98,26 @@ cdef class IntervalMixin(object): msg = 'cannot compute length between {left!r} and {right!r}' raise TypeError(msg.format(left=self.left, right=self.right)) + def _check_closed_matches(self, other, name='other'): + """Check if the closed attribute of `other` matches. + + Note that 'left' and 'right' are considered different from 'both'. + + Parameters + ---------- + other : Interval, IntervalIndex, IntervalArray + name : str + Name to use for 'other' in the error message. + + Raises + ------ + ValueError + When `other` is not closed exactly the same as self. + """ + if self.closed != other.closed: + msg = "'{}.closed' is '{}', expected '{}'." + raise ValueError(msg.format(name, other.closed, self.closed)) + cdef _interval_like(other): return (hasattr(other, 'left') diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 086bf231cb4cb..ccafc982001ae 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1220,6 +1220,9 @@ def __array__(self, dtype=None): if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) if is_extension_array_dtype(ret): + # When we're a Categorical[ExtensionArray], like Interval, + # we need to ensure __array__ get's all the way to an + # ndarray. 
ret = np.asarray(ret) return ret diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 287ad727d57e9..d833c0b89defb 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -15,7 +15,8 @@ pandas_dtype) from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import (ABCDatetimeIndex, ABCPeriodIndex, - ABCSeries) + ABCSeries, ABCIntervalIndex, + ABCInterval) from pandas.core.dtypes.missing import isna, notna from pandas.core.indexes.base import Index, _ensure_index from pandas.util._decorators import Appender @@ -106,8 +107,6 @@ class IntervalArray(IntervalMixin, ExtensionArray): def __new__(cls, data, closed=None, dtype=None, copy=False, fastpath=False, verify_integrity=True): - from pandas.core.indexes.interval import IntervalIndex - if fastpath: return cls._simple_new(data.left, data.right, closed, copy=copy, dtype=dtype, @@ -115,7 +114,7 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, if isinstance(data, ABCSeries) and is_interval_dtype(data): data = data.values - if isinstance(data, (cls, IntervalIndex)): + if isinstance(data, (cls, ABCIntervalIndex)): left = data.left right = data.right closed = data.closed @@ -458,7 +457,7 @@ def __getitem__(self, value): return self._shallow_copy(left, right) def __setitem__(self, key, value): - if not (is_interval_dtype(value) or isinstance(value, Interval)): + if not (is_interval_dtype(value) or isinstance(value, ABCInterval)): msg = "'value' should be an interval type, got {} instead." raise TypeError(msg.format(type(value))) @@ -484,13 +483,14 @@ def fillna(self, value=None, method=None, limit=None): if limit is not None: raise TypeError('limit is not supported for IntervalArray.') - if not isinstance(value, Interval): - msg = ("Interval.fillna only supports filling with scalar " - "'value'. Got a '{}' instead of a 'pandas.Interval'." 
+ if not isinstance(value, ABCInterval): + msg = ("'Interval.fillna' only supports filling with a scalar " + "'pandas.Interval'. Got a '{}' instead." .format(type(value).__name__)) raise TypeError(msg) value = getattr(value, '_values', value) + self._check_closed_matches(value, name="value") left = self.left.fillna(value=value.left) right = self.right.fillna(value=value.right) @@ -538,8 +538,6 @@ def _concat_same_type(cls, to_concat): return cls._simple_new(left, right, closed=closed, copy=False) def _shallow_copy(self, left=None, right=None): - from pandas.core.indexes.interval import IntervalIndex - if left is None: # no values passed @@ -550,7 +548,7 @@ def _shallow_copy(self, left=None, right=None): # only single value passed, could be an IntervalIndex # or array of Intervals - if not isinstance(left, (type(self), IntervalIndex)): + if not isinstance(left, (type(self), ABCIntervalIndex)): left = type(self)(left) left, right = left.left, left.right @@ -791,30 +789,6 @@ def to_tuples(self, na_tuple=True): tuples = np.where(~self.isna(), tuples, np.nan) return tuples - def repeat(self, repeats): - """Repeat elements of an IntervalArray - - Parameters - ---------- - repeats : int - Number of repetitions for each element. 
- - Returns - ------- - IntervalArray - - See Also - -------- - numpy.repeat - """ - return self._simple_new( - self.left.repeat(repeats), - self.right.repeat(repeats), - closed=self.closed - ) - -# TODO: find a home - def maybe_convert_platform_interval(values): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 738e3d896d477..ba1ddef46fb1f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -136,7 +136,6 @@ class IntervalIndex(IntervalMixin, Index): _typ = 'intervalindex' _comparables = ['name'] _attributes = ['name', 'closed'] - _allow_index_ops = True # we would like our indexing holder to defer to us _defer_to_indexing = True diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index c10cf0490f69b..e9c13023e398f 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -109,7 +109,7 @@ def test_fillna_series(self): pass def test_non_scalar_raises(self, data_missing): - msg = "Got a 'list' instead of a 'pandas.Interval'." + msg = "Got a 'list' instead." 
with tm.assert_raises_regex(TypeError, msg): data_missing.fillna([1, 1]) From 097b9a4d4f161bedae48a1a3f71a627f9a1e2594 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 6 Jul 2018 10:32:19 -0600 Subject: [PATCH 10/22] Allow setting NA --- pandas/core/arrays/interval.py | 40 ++++++++++++++++++------- pandas/tests/extension/test_interval.py | 21 +++++++++++-- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index d833c0b89defb..3c6f09bd194ab 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -12,6 +12,8 @@ is_categorical_dtype, is_float_dtype, is_integer_dtype, is_interval_dtype, is_scalar, is_string_dtype, + is_datetime64_any_dtype, + is_timedelta64_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import (ABCDatetimeIndex, ABCPeriodIndex, @@ -457,23 +459,41 @@ def __getitem__(self, value): return self._shallow_copy(left, right) def __setitem__(self, key, value): - if not (is_interval_dtype(value) or isinstance(value, ABCInterval)): + # need special casing to set directly on numpy arrays + _needs_float_conversion = False + if is_scalar(value) and isna(value): + if is_integer_dtype(self.dtype.subtype): + # can't set NaN on a numpy integer array + _needs_float_conversion = True + elif is_datetime64_any_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.datetime64('NaT') + elif is_timedelta64_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.timedelta64('NaT') + value_left, value_right = value, value + elif is_interval_dtype(value) or isinstance(value, ABCInterval): + if value.closed != self.closed: + msg = "'value.closed' ({}) does not match {}." 
+ raise ValueError(msg.format(value.closed, self.closed)) + value_left, value_right = value.left, value.right + else: + # wrong type: not interval or NA msg = "'value' should be an interval type, got {} instead." raise TypeError(msg.format(type(value))) - if value.closed != self.closed: - msg = "'value.closed' ({}) does not match {}." - raise ValueError(value.closed, self.closed) - # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. left = self.left.copy(deep=True) - right = self.right.copy(deep=True) - - left.values[key] = value.left - right.values[key] = value.right - + if _needs_float_conversion: + left = left.astype('float') + left.values[key] = value_left self._left = left + + right = self.right.copy(deep=True) + if _needs_float_conversion: + right = right.astype('float') + right.values[key] = value_right self._right = right def fillna(self, value=None, method=None, limit=None): diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index e9c13023e398f..5f08f1469b12b 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -1,7 +1,7 @@ import pytest import numpy as np -from pandas import Interval, IntervalIndex +from pandas import Interval, IntervalIndex, date_range, timedelta_range from pandas.core.arrays import IntervalArray from pandas.core.dtypes.dtypes import IntervalDtype from pandas.tests.extension import base @@ -119,7 +119,24 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - pass + + @pytest.mark.parametrize('left, right', [ + (np.arange(3.0), np.arange(1.0, 4.0)), + ([0, 2, 4], [1, 3, 5]), + (timedelta_range('0 days', periods=3), + timedelta_range('1 day', periods=3)), + (date_range('20170101', periods=3), date_range('20170102', periods=3)), + pytest.param(date_range('20170101', periods=3, tz='US/Eastern'), + 
date_range('20170102', periods=3, tz='US/Eastern'), + marks=pytest.mark.xfail(reason='fixed after rebase?'))]) + def test_set_na(self, left, right): + result = IntervalArray.from_arrays(left, right) + result[0] = np.nan + + expected = IntervalArray.from_arrays( + [np.nan] + list(left[1:]), [np.nan] + list(right[1:])) + + self.assert_extension_array_equal(result, expected) def test_repr_matches(): From c2bca65c2a572404bad3f4427a44a1d6d4faf755 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 6 Jul 2018 13:29:09 -0600 Subject: [PATCH 11/22] review edits --- pandas/compat/__init__.py | 8 -------- pandas/core/arrays/interval.py | 28 ++++++++++++++++------------ pandas/core/indexes/interval.py | 3 ++- pandas/util/_doctools.py | 8 ++++++++ 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index eccb7ab2ad51f..12517372fedd1 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -451,11 +451,3 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 - - -class _WritableDoc(type): - # Remove this when Python2 support is dropped - # __doc__ is not mutable for new-style classes in Python2, which means - # we can't use @Appender to share class docstrings. This can be used - # with `add_metaclass` to make cls.__doc__ mutable. 
- pass diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 3c6f09bd194ab..3502a35e1cd66 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -3,7 +3,7 @@ from pandas._libs.interval import (Interval, IntervalMixin, intervals_to_interval_bounds) -from pandas.compat import add_metaclass, _WritableDoc +from pandas.compat import add_metaclass from pandas.compat.numpy import function as nv import pandas.core.common as com from pandas.core.config import get_option @@ -22,6 +22,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core.indexes.base import Index, _ensure_index from pandas.util._decorators import Appender +from pandas.util._doctools import _WritableDoc from . import ExtensionArray, Categorical @@ -56,7 +57,7 @@ dtype : dtype or None, default None If None, dtype will be inferred - .. versoinadded:: 0.23.0 + .. versionadded:: 0.23.0 Attributes ---------- @@ -96,7 +97,7 @@ @Appender(_interval_shared_docs['class'] % dict( klass="IntervalArray", summary="Pandas array for interval data that are closed on the same side", - versionadded="0.23.0", + versionadded="0.24.0", name='', extra_methods='', examples='', )) @add_metaclass(_WritableDoc) @@ -116,6 +117,7 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, if isinstance(data, ABCSeries) and is_interval_dtype(data): data = data.values + if isinstance(data, (cls, ABCIntervalIndex)): left = data.left right = data.right @@ -128,6 +130,7 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, " {} was passed") raise TypeError(msg.format(cls.__name__, data)) + # might need to convert empty or purely na data data = maybe_convert_platform_interval(data) left, right, infer_closed = intervals_to_interval_bounds(data) @@ -387,6 +390,7 @@ def from_tuples(cls, data, closed='right', copy=False, dtype=None): if len(data): left, right = [], [] else: + # ensure that empty data keeps input dtype left = right = data for d in data: @@ 
-598,6 +602,15 @@ def isna(self): def nbytes(self): return self.left.nbytes + self.right.nbytes + @property + def size(self): + # Avoid materializing self.values + return self.left.size + + @property + def shape(self): + return self.left.shape + @property def itemsize(self): return self.left.itemsize + self.right.itemsize @@ -731,15 +744,6 @@ def mid(self): # datetime safe version return self.left + 0.5 * self.length - @property - def size(self): - # Avoid materializing self.values - return self.left.size - - @property - def shape(self): - return self.left.shape - @property def is_non_overlapping_monotonic(self): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ba1ddef46fb1f..15a46ab774caf 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -4,7 +4,7 @@ import numpy as np -from pandas.compat import add_metaclass, _WritableDoc +from pandas.compat import add_metaclass from pandas.core.dtypes.missing import isna from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( @@ -34,6 +34,7 @@ from pandas.core.indexes.multi import MultiIndex import pandas.core.common as com from pandas.util._decorators import cache_readonly, Appender +from pandas.util._doctools import _WritableDoc from pandas.util.exceptions import rewrite_exception from pandas.core.config import get_option from pandas.tseries.frequencies import to_offset diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 667c5d9526563..c9e6e27363aed 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -163,6 +163,14 @@ def _make_table(self, ax, df, title, height=None): ax.axis('off') +class _WritableDoc(type): + # Remove this when Python2 support is dropped + # __doc__ is not mutable for new-style classes in Python2, which means + # we can't use @Appender to share class docstrings. This can be used + # with `add_metaclass` to make cls.__doc__ mutable. 
+ pass + + if __name__ == "__main__": import matplotlib.pyplot as plt From 39249a3b43c7bc609bda7980edc147376464685f Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 6 Jul 2018 13:34:19 -0600 Subject: [PATCH 12/22] implement IntervalArray.repeat to fix failing tests --- pandas/core/arrays/interval.py | 31 +++++++++++++++++++++++++ pandas/tests/extension/test_interval.py | 21 ++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 3502a35e1cd66..5f977cadaf8bf 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -813,6 +813,37 @@ def to_tuples(self, na_tuple=True): tuples = np.where(~self.isna(), tuples, np.nan) return tuples + def repeat(self, repeats, **kwargs): + """ + Repeat elements of an IntervalArray. + + Returns a new IntervalArray where each element of the current + IntervalArray is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int + The number of repetitions for each element. + + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + IntervalArray + Newly created IntervalArray with repeated elements. 
+ + See Also + -------- + Index.repeat : Equivalent function for Index + Series.repeat : Equivalent function for Series + numpy.repeat : Underlying implementation + """ + left_repeat = self.left.repeat(repeats, **kwargs) + right_repeat = self.right.repeat(repeats, **kwargs) + return self._shallow_copy(left=left_repeat, right=right_repeat) + def maybe_convert_platform_interval(values): """ diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 5f08f1469b12b..6e95b55bffa8d 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -84,7 +84,26 @@ class TestInterface(BaseInterval, base.BaseInterfaceTests): class TestMethods(BaseInterval, base.BaseMethodsTests): - pass + @pytest.mark.parametrize('repeats', [0, 1, 5]) + @pytest.mark.parametrize('left, right', [ + (np.arange(3.0), np.arange(1.0, 4.0)), + ([0, 2, 4], [1, 3, 5]), + (timedelta_range('0 days', periods=3), + timedelta_range('1 day', periods=3)), + (date_range('20170101', periods=3), date_range('20170102', periods=3)), + pytest.param(date_range('20170101', periods=3, tz='US/Eastern'), + date_range('20170102', periods=3, tz='US/Eastern'), + marks=pytest.mark.xfail(reason='fixed after rebase?'))]) + def test_repeats(self, left, right, repeats): + array = IntervalArray.from_arrays(left, right) + result = array.repeat(repeats) + + left_reps = [x for x in left for _ in range(repeats)] + right_reps = [x for x in right for _ in range(repeats)] + expected = IntervalArray.from_arrays( + left_reps, right_reps, dtype=array.dtype) + + tm.assert_extension_array_equal(result, expected) class TestMissing(BaseInterval, base.BaseMissingTests): From f82df728edbfa00fe3bb4981208500b907491a12 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sat, 7 Jul 2018 10:05:52 -0500 Subject: [PATCH 13/22] cleanup tests --- pandas/tests/extension/test_interval.py | 64 +++++++++++++------------ 1 file changed, 34 insertions(+), 30 deletions(-) diff 
--git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 6e95b55bffa8d..337dd696becf7 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -1,7 +1,7 @@ import pytest import numpy as np -from pandas import Interval, IntervalIndex, date_range, timedelta_range +from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range from pandas.core.arrays import IntervalArray from pandas.core.dtypes.dtypes import IntervalDtype from pandas.tests.extension import base @@ -15,6 +15,22 @@ def make_data(): return [Interval(l, r) for l, r in zip(left, right)] +@pytest.fixture(params=[ + (Index([0, 2, 4]), Index([1, 3, 5])), + (Index([0., 1., 2.]), Index([1., 2., 3.])), + (timedelta_range('0 days', periods=3), + timedelta_range('1 day', periods=3)), + (date_range('20170101', periods=3), date_range('20170102', periods=3)), + (date_range('20170101', periods=3, tz='US/Eastern'), + date_range('20170102', periods=3, tz='US/Eastern'))], + ids=lambda x: str(x[0].dtype)) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + @pytest.fixture def dtype(): return IntervalDtype() @@ -85,26 +101,21 @@ class TestInterface(BaseInterval, base.BaseInterfaceTests): class TestMethods(BaseInterval, base.BaseMethodsTests): @pytest.mark.parametrize('repeats', [0, 1, 5]) - @pytest.mark.parametrize('left, right', [ - (np.arange(3.0), np.arange(1.0, 4.0)), - ([0, 2, 4], [1, 3, 5]), - (timedelta_range('0 days', periods=3), - timedelta_range('1 day', periods=3)), - (date_range('20170101', periods=3), date_range('20170102', periods=3)), - pytest.param(date_range('20170101', periods=3, tz='US/Eastern'), - date_range('20170102', periods=3, tz='US/Eastern'), - marks=pytest.mark.xfail(reason='fixed after rebase?'))]) - def test_repeats(self, left, right, repeats): - array = IntervalArray.from_arrays(left, right) - result = 
array.repeat(repeats) - - left_reps = [x for x in left for _ in range(repeats)] - right_reps = [x for x in right for _ in range(repeats)] + def test_repeat(self, left_right_dtypes, repeats): + left, right = left_right_dtypes + result = IntervalArray.from_arrays(left, right).repeat(repeats) expected = IntervalArray.from_arrays( - left_reps, right_reps, dtype=array.dtype) - + left.repeat(repeats), right.repeat(repeats)) tm.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize('bad_repeats, msg', [ + (-1, 'negative dimensions are not allowed'), + ('foo', r'invalid literal for int\(\) with base 10')]) + def test_repeat_errors(self, bad_repeats, msg): + array = IntervalArray.from_breaks(range(4)) + with tm.assert_raises_regex(ValueError, msg): + array.repeat(bad_repeats) + class TestMissing(BaseInterval, base.BaseMissingTests): # Index.fillna only accepts scalar `value`, so we have to skip all @@ -139,21 +150,14 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - @pytest.mark.parametrize('left, right', [ - (np.arange(3.0), np.arange(1.0, 4.0)), - ([0, 2, 4], [1, 3, 5]), - (timedelta_range('0 days', periods=3), - timedelta_range('1 day', periods=3)), - (date_range('20170101', periods=3), date_range('20170102', periods=3)), - pytest.param(date_range('20170101', periods=3, tz='US/Eastern'), - date_range('20170102', periods=3, tz='US/Eastern'), - marks=pytest.mark.xfail(reason='fixed after rebase?'))]) - def test_set_na(self, left, right): + def test_set_na(self, left_right_dtypes): + left, right = left_right_dtypes result = IntervalArray.from_arrays(left, right) result[0] = np.nan - expected = IntervalArray.from_arrays( - [np.nan] + list(left[1:]), [np.nan] + list(right[1:])) + expected_left = Index([left._na_value] + list(left[1:])) + expected_right = Index([right._na_value] + list(right[1:])) + expected = IntervalArray.from_arrays(expected_left, expected_right) 
self.assert_extension_array_equal(result, expected) From e0fe0bcf8ad9d07e7266c5342b6fe664e1c31a1e Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sat, 7 Jul 2018 14:41:08 -0500 Subject: [PATCH 14/22] additional review fixups --- pandas/core/arrays/interval.py | 81 +++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 5f977cadaf8bf..2269a5182b050 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -477,9 +477,7 @@ def __setitem__(self, key, value): value = np.timedelta64('NaT') value_left, value_right = value, value elif is_interval_dtype(value) or isinstance(value, ABCInterval): - if value.closed != self.closed: - msg = "'value.closed' ({}) does not match {}." - raise ValueError(msg.format(value.closed, self.closed)) + self._check_closed_matches(value, name="value") value_left, value_right = value.left, value.right else: # wrong type: not interval or NA @@ -508,8 +506,8 @@ def fillna(self, value=None, method=None, limit=None): raise TypeError('limit is not supported for IntervalArray.') if not isinstance(value, ABCInterval): - msg = ("'Interval.fillna' only supports filling with a scalar " - "'pandas.Interval'. Got a '{}' instead." + msg = ("'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval'. Got a '{}' instead." .format(type(value).__name__)) raise TypeError(msg) @@ -525,8 +523,30 @@ def dtype(self): return IntervalDtype.construct_from_string(str(self.left.dtype)) def astype(self, dtype, copy=True): + """ + Cast to an ExtensionArray or NumPy array with dtype 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. 
+ + Returns + ------- + array : ExtensionArray or ndarray + ExtensionArray or NumPy ndarray with 'dtype' for its dtype. + """ dtype = pandas_dtype(dtype) - if is_interval_dtype(dtype) and dtype != self.dtype: + if is_interval_dtype(dtype): + if dtype == self.dtype: + return self.copy() if copy else self + + # need to cast to different subtype try: new_left = self.left.astype(dtype.subtype) new_right = self.right.astype(dtype.subtype) @@ -535,11 +555,6 @@ def astype(self, dtype, copy=True): 'incompatible') raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) return self._shallow_copy(new_left, new_right) - elif is_interval_dtype(dtype): - if copy: - return self.copy() - else: - return self elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) # TODO: This try/except will be repeated. @@ -551,6 +566,17 @@ def astype(self, dtype, copy=True): @classmethod def _concat_same_type(cls, to_concat): + """ + Concatenate multiple IntervalArray + + Parameters + ---------- + to_concat : sequence of IntervalArray + + Returns + ------- + IntervalArray + """ closed = set(interval.closed for interval in to_concat) if len(closed) != 1: raise ValueError("Intervals must all be closed on the same side.") @@ -584,8 +610,19 @@ def _shallow_copy(self, left=None, right=None): return self._simple_new(left, right, closed=self.closed, verify_integrity=False) - # TODO: doc def copy(self, deep=False): + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + IntervalArray + """ left = self.left.copy(deep=True) if deep else self.left right = self.right.copy(deep=True) if deep else self.right closed = self.closed @@ -617,6 +654,26 @@ def itemsize(self): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + """ + Take elements from the IntervalArray. + + Parameters + ---------- + indexer : sequence of integers + indices to be taken. 
-1 is used to indicate values + that are missing. + allow_fill : bool, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + fill_value : any, default None + Fill value to replace -1 values with. If applicable, this should + use the sentinel missing value for this type. + + Returns + ------- + IntervalArray + """ nv.validate_take(tuple(), kwargs) indices = _ensure_platform_int(indices) left, right = self.left, self.right From 7c0ffd3cc719a9f5a341cf3f8638e141c1617357 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sat, 7 Jul 2018 14:46:12 -0500 Subject: [PATCH 15/22] rename exceptions.py -> _exceptions.py --- pandas/core/indexes/interval.py | 2 +- pandas/util/{exceptions.py => _exceptions.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename pandas/util/{exceptions.py => _exceptions.py} (100%) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 15a46ab774caf..4b367459e41e6 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -35,7 +35,7 @@ import pandas.core.common as com from pandas.util._decorators import cache_readonly, Appender from pandas.util._doctools import _WritableDoc -from pandas.util.exceptions import rewrite_exception +from pandas.util._exceptions import rewrite_exception from pandas.core.config import get_option from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset diff --git a/pandas/util/exceptions.py b/pandas/util/_exceptions.py similarity index 100% rename from pandas/util/exceptions.py rename to pandas/util/_exceptions.py From a33940ad498c25f8195bd3785fe09242166aef2a Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sun, 8 Jul 2018 10:31:30 -0500 Subject: [PATCH 16/22] fix breaking tests after merge --- pandas/core/arrays/interval.py | 178 +++++++++++++++--------- 
pandas/core/dtypes/concat.py | 3 + pandas/core/dtypes/dtypes.py | 11 ++ pandas/core/dtypes/missing.py | 4 - pandas/core/indexes/interval.py | 4 +- pandas/tests/extension/base/getitem.py | 1 + pandas/tests/extension/test_interval.py | 17 ++- 7 files changed, 148 insertions(+), 70 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1d4fb8b643b84..16c67b573d25e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -8,12 +8,11 @@ import pandas.core.common as com from pandas.core.config import get_option from pandas.core.dtypes.cast import maybe_convert_platform -from pandas.core.dtypes.common import (_ensure_platform_int, - is_categorical_dtype, is_float_dtype, +from pandas.core.dtypes.common import (is_categorical_dtype, is_float_dtype, is_integer_dtype, is_interval_dtype, is_scalar, is_string_dtype, is_datetime64_any_dtype, - is_timedelta64_dtype, + is_timedelta64_dtype, is_interval, pandas_dtype) from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import (ABCDatetimeIndex, ABCPeriodIndex, @@ -192,7 +191,7 @@ def _simple_new(cls, left, right, closed=None, return result @classmethod - def _constructor_from_sequence(cls, scalars): + def _from_sequence(cls, scalars): return cls(scalars) @classmethod @@ -279,10 +278,10 @@ def from_breaks(cls, breaks, closed='right', copy=False, dtype=None): See Also -------- interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_breaks : Construct an IntervalIndex from an array of + %(klass)s.from_breaks : Construct an %(klass)s from an array of splits. - %(klass)s.from_tuples : Construct an IntervalIndex from a - list/array of tuples. + %(klass)s.from_tuples : Construct an %(klass)s from an + array-like of tuples. 
Examples @@ -340,12 +339,12 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): right array %(klass)s.from_breaks : Construct an %(klass)s from an array of splits - %(klass)s.from_tuples : Construct an %(klass)s from a - list/array of tuples + %(klass)s.from_tuples : Construct an %(klass)s from an + array-like of tuples """ _interval_shared_docs['from_tuples'] = """ - Construct an %(klass)s from a list/array of tuples + Construct an %(klass)s from an array-like of tuples Parameters ---------- @@ -410,7 +409,7 @@ def from_tuples(cls, data, closed='right', copy=False, dtype=None): dtype=dtype) def _validate(self): - """Verify that the IntervalIndex is valid. + """Verify that the IntervalArray is valid. Checks that @@ -442,21 +441,19 @@ def __len__(self): return len(self.left) def __getitem__(self, value): - mask = self.isna()[value] - if is_scalar(mask) and mask: - return self._fill_value - left = self.left[value] right = self.right[value] # scalar if not isinstance(left, Index): + if isna(left): + return self._fill_value return Interval(left, right, self.closed) return self._shallow_copy(left, right) def __setitem__(self, key, value): - # need special casing to set directly on numpy arrays + # na value: need special casing to set directly on numpy arrays _needs_float_conversion = False if is_scalar(value) and isna(value): if is_integer_dtype(self.dtype.subtype): @@ -469,13 +466,21 @@ def __setitem__(self, key, value): # need proper NaT to set directly on the numpy array value = np.timedelta64('NaT') value_left, value_right = value, value + + # scalar interval elif is_interval_dtype(value) or isinstance(value, ABCInterval): self._check_closed_matches(value, name="value") value_left, value_right = value.left, value.right + else: - # wrong type: not interval or NA - msg = "'value' should be an interval type, got {} instead." 
- raise TypeError(msg.format(type(value))) + # list-like of intervals + try: + array = IntervalArray(value) + value_left, value_right = array.left, array.right + except TypeError: + # wrong type: not interval or NA + msg = "'value' should be an interval type, got {} instead." + raise TypeError(msg.format(type(value))) # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. @@ -588,7 +593,7 @@ def _shallow_copy(self, left=None, right=None, closed=None): elif right is None: - # only single value passed, could be an IntervalIndex + # only single value passed, could be an IntervalArray # or array of Intervals if not isinstance(left, (type(self), ABCIntervalIndex)): left = type(self)(left) @@ -645,55 +650,97 @@ def shape(self): def itemsize(self): return self.left.itemsize + self.right.itemsize - def take(self, indices, axis=0, allow_fill=True, fill_value=None, + def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. Parameters ---------- - indexer : sequence of integers - indices to be taken. -1 is used to indicate values - that are missing. - allow_fill : bool, default True - If False, indexer is assumed to contain no -1 values so no filling - will be done. This short-circuits computation of a mask. Result is - undefined if allow_fill == False and -1 is present in indexer. - fill_value : any, default None - Fill value to replace -1 values with. If applicable, this should - use the sentinel missing value for this type. + indices : sequence of integers + Indices to be taken. + + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. 
Any + other negative values raise a ``ValueError``. + + fill_value : Interval or NA, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + axis : any, default None + Present for compat with IntervalIndex; does nothing. Returns ------- IntervalArray - """ - nv.validate_take(tuple(), kwargs) - indices = _ensure_platform_int(indices) - left, right = self.left, self.right - if fill_value is None: - fill_value = self._na_value - mask = indices == -1 + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + """ + from pandas.core.algorithms import take - if not mask.any(): - # we won't change dtype here in this case - # if we don't need - allow_fill = False + nv.validate_take(tuple(), kwargs) - taker = lambda x: x.take(indices, allow_fill=allow_fill, - fill_value=fill_value) + fill_left = fill_right = fill_value + if allow_fill: + if fill_value is None: + fill_left = fill_right = self.left._na_value + elif is_interval(fill_value): + self._check_closed_matches(fill_value, name='fill_value') + fill_left, fill_right = fill_value.left, fill_value.right + elif not is_scalar(fill_value) and notna(fill_value): + msg = ("'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval or NA'. Got a '{}' instead." 
+ .format(type(fill_value).__name__)) + raise ValueError(msg) + + left_take = take(self.left, indices, + allow_fill=allow_fill, fill_value=fill_left) + right_take = take(self.right, indices, + allow_fill=allow_fill, fill_value=fill_right) + + return self._shallow_copy(left_take, right_take) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each interval. - try: - new_left = taker(left) - new_right = taker(right) - except ValueError: + Parameters + ---------- + dropna : boolean, default True + Don't include counts of NaN. - # we need to coerce; migth have NA's in an - # integer dtype - new_left = taker(left.astype(float)) - new_right = taker(right.astype(float)) + Returns + ------- + counts : Series - return self._shallow_copy(new_left, new_right) + See Also + -------- + Series.value_counts + """ + # TODO: implement this in a non-naive way! + from pandas.core.algorithms import value_counts + return value_counts(np.asarray(self), dropna=dropna) # Formatting @@ -748,7 +795,7 @@ def _format_space(self): @property def left(self): """ - Return the left endpoints of each Interval in the IntervalIndex as + Return the left endpoints of each Interval in the IntervalArray as an Index """ return self._left @@ -756,7 +803,7 @@ def left(self): @property def right(self): """ - Return the right endpoints of each Interval in the IntervalIndex as + Return the right endpoints of each Interval in the IntervalArray as an Index """ return self._right @@ -810,20 +857,20 @@ def set_closed(self, closed): def length(self): """ Return an Index with entries denoting the length of each Interval in - the IntervalIndex + the IntervalArray """ try: return self.right - self.left except TypeError: # length not defined for some types, e.g. string - msg = ('IntervalIndex contains Intervals without defined length, ' + msg = ('IntervalArray contains Intervals without defined length, ' 'e.g. 
Intervals with string endpoints') raise TypeError(msg) @property def mid(self): """ - Return the midpoint of each Interval in the IntervalIndex as an Index + Return the midpoint of each Interval in the IntervalArray as an Index """ try: return 0.5 * (self.left + self.right) @@ -834,7 +881,7 @@ def mid(self): @property def is_non_overlapping_monotonic(self): """ - Return True if the IntervalIndex is non-overlapping (no Intervals share + Return True if the IntervalArray is non-overlapping (no Intervals share points) and is either monotonic increasing or monotonic decreasing, else False """ @@ -856,7 +903,7 @@ def is_non_overlapping_monotonic(self): # Conversion def __array__(self, dtype=None): """ - Return the IntervalIndex's data as a numpy array of Interval + Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ left = self.left @@ -934,11 +981,11 @@ def repeat(self, repeats, **kwargs): def maybe_convert_platform_interval(values): """ - Try to do platform conversion, with special casing for IntervalIndex. + Try to do platform conversion, with special casing for IntervalArray. Wrapper around maybe_convert_platform that alters the default return - dtype in certain cases to be compatible with IntervalIndex. For example, + dtype in certain cases to be compatible with IntervalArray. For example, empty lists return with integer dtype instead of object dtype, which is - prohibited for IntervalIndex. + prohibited for IntervalArray. 
Parameters ---------- @@ -951,6 +998,9 @@ def maybe_convert_platform_interval(values): if isinstance(values, (list, tuple)) and len(values) == 0: # GH 19016 # empty lists/tuples get object dtype by default, but this is not - # prohibited for IntervalIndex, so coerce to integer instead + # prohibited for IntervalArray, so coerce to integer instead return np.array([], dtype=np.int64) + elif is_categorical_dtype(values): + values = np.asarray(values) + return maybe_convert_platform(values) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9f6813bc38464..feedc0ebd86f4 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -15,6 +15,7 @@ is_period_dtype, is_object_dtype, is_bool_dtype, + is_interval_dtype, is_dtype_equal, _NS_DTYPE, _TD_DTYPE) @@ -58,6 +59,8 @@ def get_dtype_kinds(l): typ = 'bool' elif is_period_dtype(dtype): typ = str(arr.dtype) + elif is_interval_dtype(dtype): + typ = str(arr.dtype) else: typ = dtype.kind typs.add(typ) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 42c0878dd36dd..fbce7dc28dfe0 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -750,6 +750,17 @@ def __new__(cls, subtype=None): cls._cache[str(subtype)] = u return u + @classmethod + def construct_array_type(cls): + """Return the array type associated with this dtype + + Returns + ------- + type + """ + from pandas.core.arrays import IntervalArray + return IntervalArray + @classmethod def construct_from_string(cls, string): """ diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ad4588f254174..cda94d02f013a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -196,10 +196,6 @@ def _isna_ndarraylike(obj): else: values = obj result = values.isna() - elif is_interval_dtype(values): - # TODO(IntervalArray): remove this if block - from pandas import IntervalIndex - result = IntervalIndex(obj).isna() elif 
is_string_dtype(dtype): # Working around NumPy ticket 1542 shape = values.shape diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b6366240d0bb2..fc756ef3ed93d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -342,7 +342,9 @@ def set_closed(self, closed): msg = "invalid option for 'closed': {closed}" raise ValueError(msg.format(closed=closed)) - return self._shallow_copy(closed=closed) + # return self._shallow_copy(closed=closed) + array = self._data.set_closed(closed) + return self._simple_new(array, self.name) @property def length(self): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index e9df49780f119..7eda28bd6e663 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -240,6 +240,7 @@ def test_reindex_non_na_fill_value(self, data_missing): na = data_missing[0] array = data_missing._from_sequence([na, valid]) + print(array) ser = pd.Series(array) result = ser.reindex([0, 1, 2], fill_value=valid) expected = pd.Series(data_missing._from_sequence([na, valid, valid])) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 21e998c4f9c44..959e33d14b3d3 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -48,6 +48,15 @@ def data_missing(): return IntervalArray.from_tuples([None, (0, 1)]) +@pytest.fixture +def data_repeated(): + """Return different versions of data for count times""" + def gen(count): + for _ in range(count): + yield IntervalArray(make_data()) + yield gen + + @pytest.fixture def data_for_sorting(): return IntervalArray.from_tuples([(1, 2), (2, 3), (0, 1)]) @@ -76,7 +85,9 @@ class BaseInterval(object): class TestDtype(BaseInterval, base.BaseDtypeTests): - pass + + def test_array_type_with_arg(self, data, dtype): + assert dtype.construct_array_type() is IntervalArray class TestCasting(BaseInterval, 
base.BaseCastingTests): @@ -125,6 +136,10 @@ def test_set_closed(self, closed, new_closed): expected = IntervalArray.from_breaks(range(10), closed=new_closed) tm.assert_extension_array_equal(result, expected) + @pytest.mark.skip(reason='addition is not defined for intervals') + def test_combine_add(self, data_repeated): + pass + class TestMissing(BaseInterval, base.BaseMissingTests): # Index.fillna only accepts scalar `value`, so we have to skip all From 382737db9bdb3c0be37442958d443a26ad9230d1 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sun, 8 Jul 2018 15:15:13 -0500 Subject: [PATCH 17/22] move test_interval --- pandas/tests/extension/interval/__init__.py | 0 pandas/tests/extension/{ => interval}/test_interval.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/extension/interval/__init__.py rename pandas/tests/extension/{ => interval}/test_interval.py (100%) diff --git a/pandas/tests/extension/interval/__init__.py b/pandas/tests/extension/interval/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/interval/test_interval.py similarity index 100% rename from pandas/tests/extension/test_interval.py rename to pandas/tests/extension/interval/test_interval.py From 95f8f151efc6efe6ce755b88f8e29288261dcfe5 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 9 Jul 2018 18:48:17 -0500 Subject: [PATCH 18/22] post-rebase review fixes --- doc/source/basics.rst | 1 + doc/source/whatsnew/v0.24.0.txt | 31 +++++------- pandas/core/arrays/interval.py | 20 +++++++- pandas/core/indexes/interval.py | 13 +++-- pandas/tests/extension/base/getitem.py | 1 - pandas/tests/extension/base/setitem.py | 31 ------------ .../tests/extension/interval/test_interval.py | 2 +- .../indexes/interval/test_construction.py | 16 +++++++ pandas/tests/util/test_hashing.py | 48 ++++++++++--------- 9 files changed, 82 insertions(+), 81 deletions(-) diff --git 
a/doc/source/basics.rst b/doc/source/basics.rst index 07b5c1f98815a..cd88d23236a38 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1934,6 +1934,7 @@ NumPy's type-system for a few cases. * :ref:`Categorical ` * :ref:`Datetime with Timezone ` +* :ref:`Period ` * Interval Pandas uses the ``object`` dtype for storing strings. diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 211b6ea0e43db..5d01b0bcd86f4 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -72,8 +72,8 @@ Current Behavior: Storing Interval Data in Series and DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Interval data may now be stored in a Series or DataFrame, in addition to an -:class:`IntervalIndex` like before (:issue:`19453`). +Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an +:class:`IntervalIndex` like previously (:issue:`19453`). .. ipython:: python @@ -81,24 +81,17 @@ Interval data may now be stored in a Series or DataFrame, in addition to an ser ser.dtype -Previously, these would be cast to a NumPy array of Interval objects. In general, +Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, this should result in better performance when storing an array of intervals in -a Series. +a :class:`Series`. -Note that the ``.values`` of a Series containing intervals is no longer a NumPy -array. Rather, it's an ``ExtensionArray``, composed of two arrays ``left`` and -``right``. +Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy +array, but rather an ``ExtensionArray``: .. ipython:: python ser.values -To recover the NumPy array of Interval objects, use :func:`numpy.asarray`: - -.. ipython:: python - - np.asarray(ser.values) - This is the same behavior as ``Series.values`` for categorical data. See :ref:`whatsnew_0240.api_breaking.interval_values` for more. 
@@ -133,9 +126,8 @@ Backwards incompatible API changes ``IntervalIndex.values`` is now an ``IntervalArray`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``.values`` attribute of an :class:`IntervalIndex` now returns an -``IntervalArray``, rather than a NumPy array of :class:`Interval` objects -(:issue:`19453`). +The :attr:`~IntervalIndex.values` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). Previous Behavior: @@ -158,11 +150,12 @@ New Behavior: This mirrors ``CateogricalIndex.values``, which returns a ``Categorical``. -For situations where you need an ``ndarray`` of Interval objects, use +For situations where you need an ``ndarray`` of ``Interval`` objects, use :meth:`numpy.asarray` or ``idx.astype(object)``. .. ipython:: python - + + np.asarray(idx) idx.values.astype(object) @@ -421,7 +414,7 @@ Interval ^^^^^^^^ - Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) -- Bug in the ``IntervalIndex`` repr missing a trailing comma at the end of the "data" section (:issue`20611`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue`20611`) - - diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 16c67b573d25e..7eccdef9ad929 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -172,7 +172,7 @@ def _simple_new(cls, left, right, closed=None, elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ('category, object, and string subtypes are not supported ' - 'for IntervalIndex') + 'for IntervalArray') raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = 'Period dtypes are not supported, use a PeriodIndex instead' @@ -586,6 +586,24 @@ def _concat_same_type(cls, to_concat): return cls._simple_new(left, right, closed=closed,
copy=False) def _shallow_copy(self, left=None, right=None, closed=None): + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : array-like + Values to be used for the left-side of the intervals. + If None, the existing left and right values will be used. + + right : array-like + Values to be used for the right-side of the intervals. + If None and left is IntervalArray-like, the left and right + of the IntervalArray-like will be used. + + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. If None, the existing closed will be used. + """ if left is None: # no values passed diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index fc756ef3ed93d..9375a60d0964c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -163,7 +163,8 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, @classmethod def _simple_new(cls, array, name, closed=None): - """Construct from an IntervalArray + """ + Construct from an IntervalArray Parameters ---------- @@ -256,16 +257,18 @@ def contains(self, key): @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) def from_breaks(cls, breaks, closed='right', name=None, copy=False, dtype=None): - array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, - dtype=dtype) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, + dtype=dtype) return cls._simple_new(array, name=name) @classmethod @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) def from_arrays(cls, left, right, closed='right', name=None, copy=False, dtype=None): - array = IntervalArray.from_arrays(left, right, closed, copy=copy, - dtype=dtype) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays(left, right, closed, copy=copy,
+ dtype=dtype) return cls._simple_new(array, name=name) @classmethod diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 7eda28bd6e663..e9df49780f119 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -240,7 +240,6 @@ def test_reindex_non_na_fill_value(self, data_missing): na = data_missing[0] array = data_missing._from_sequence([na, valid]) - print(array) ser = pd.Series(array) result = ser.reindex([0, 1, 2], fill_value=valid) expected = pd.Series(data_missing._from_sequence([na, valid, valid])) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index f97f7fbae91b8..cf302b375bcb8 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -10,37 +10,6 @@ class BaseSetitemTests(BaseExtensionTests): - def test_set_scalar(self, data): - expected = data.take([1, 1]) - subset = data[:2].copy() - - subset[0] = data[1] - self.assert_extension_array_equal(subset, expected) - - def test_set_mask_scalar(self, data): - expected = data.take([1, 1, 2, 1]) - subset = data[:4].copy() - - subset[[True, True, False, True]] = data[1] - self.assert_extension_array_equal(subset, expected) - - @pytest.mark.parametrize('key', [ - [False, True, True, True], - [1, 2, 3], - ], ids=['mask', 'fancy']) - def test_set_array(self, key, data): - expected = data.take([0, 2, 2, 1]) - value = data.take([2, 2, 1]) - subset = data[:4].copy() - - subset[key] = value - self.assert_extension_array_equal(subset, expected) - - def test_bad_mask_bad_length_raise(self, data): - value = data[0] - with pytest.raises(IndexError): - data[[True, False]] = value - def test_setitem_scalar_series(self, data): arr = pd.Series(data) arr[0] = data[1] diff --git a/pandas/tests/extension/interval/test_interval.py b/pandas/tests/extension/interval/test_interval.py index 959e33d14b3d3..a10a56ddfdfac 100644 --- 
a/pandas/tests/extension/interval/test_interval.py +++ b/pandas/tests/extension/interval/test_interval.py @@ -121,7 +121,7 @@ def test_repeat(self, left_right_dtypes, repeats): @pytest.mark.parametrize('bad_repeats, msg', [ (-1, 'negative dimensions are not allowed'), - ('foo', r'invalid literal for int\(\) with base 10')]) + ('foo', r'invalid literal for (int|long)\(\) with base 10')]) def test_repeat_errors(self, bad_repeats, msg): array = IntervalArray.from_breaks(range(4)) with tm.assert_raises_regex(ValueError, msg): diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 10f174bfee9d2..d46e19ef56dd0 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -8,6 +8,7 @@ Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical, CategoricalIndex, date_range, timedelta_range, period_range, notna) from pandas.compat import lzip +from pandas.core.arrays import IntervalArray from pandas.core.dtypes.common import is_categorical_dtype from pandas.core.dtypes.dtypes import IntervalDtype import pandas.core.common as com @@ -348,6 +349,17 @@ def test_override_inferred_closed(self, constructor, data, closed): result = constructor(data, closed=closed) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('values_constructor', [ + list, np.array, IntervalIndex, IntervalArray]) + def test_index_object_dtype(self, values_constructor): + # Index(intervals, dtype=object) is an Index (not an IntervalIndex) + intervals = [Interval(0, 1), Interval(1, 2), Interval(2, 3)] + values = values_constructor(intervals) + result = Index(values, dtype=object) + + assert type(result) is Index + tm.assert_numpy_array_equal(result.values, np.array(values)) + class TestFromIntervals(TestClassConstructors): """ @@ -368,3 +380,7 @@ def test_deprecated(self): ivs = [Interval(0, 1), Interval(1, 2)] with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): IntervalIndex.from_intervals(ivs) + + @pytest.mark.skip(reason='parent class test that is not applicable') + def test_index_object_dtype(self): + pass diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 82b870c156cc8..7badb0817c511 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -46,10 +46,11 @@ def test_hash_array_mixed(self): tm.assert_numpy_array_equal(result1, result2) tm.assert_numpy_array_equal(result1, result3) - def test_hash_array_errors(self): - - for val in [5, 'foo', pd.Timestamp('20130101')]: - pytest.raises(TypeError, hash_array, val) + @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')]) + def test_hash_array_errors(self, val): + msg = 'must pass a ndarray-like' + with tm.assert_raises_regex(TypeError, msg): + hash_array(val) def check_equal(self, obj, **kwargs): a = hash_pandas_object(obj, **kwargs) @@ -80,26 +81,27 @@ def test_hash_tuples(self): result = hash_tuples(tups[0]) assert result == expected[0] - def test_hash_tuple(self): + @pytest.mark.parametrize('tup', [ + (1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), + ('A', pd.Timestamp("2012-01-01"))]) + def test_hash_tuple(self, tup): # test equivalence between hash_tuples and hash_tuple - for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), - ('A', pd.Timestamp("2012-01-01"))]: - result = hash_tuple(tup) - expected = hash_tuples([tup])[0] - assert result == expected - - def test_hash_scalar(self): - for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-01", tz='Europe/Brussels'), - datetime.datetime(2012, 1, 1), - pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), - pd.Timedelta('1 days'), datetime.timedelta(1), - pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), - np.nan, pd.NaT, None]: - result = _hash_scalar(val) - expected = hash_array(np.array([val], dtype=object), - categorize=True) - 
assert result[0] == expected[0] + result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + assert result == expected + + @pytest.mark.parametrize('val', [ + 1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz='Europe/Brussels'), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), + pd.Timedelta('1 days'), datetime.timedelta(1), + pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), + np.nan, pd.NaT, None]) + def test_hash_scalar(self, val): + result = _hash_scalar(val) + expected = hash_array(np.array([val], dtype=object), categorize=True) + assert result[0] == expected[0] def test_hash_tuples_err(self): From e82eeb6df69f4c1a5e4fd0c07ec9e2027660f0c6 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 9 Jul 2018 20:19:13 -0500 Subject: [PATCH 19/22] unused import --- pandas/core/dtypes/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index cda94d02f013a..89a7f2ca53a09 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -10,7 +10,7 @@ from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_interval_dtype, + is_timedelta64_dtype, is_period_dtype, is_complex_dtype, is_string_like_dtype, is_bool_dtype, From bccb4f7f54f59a7e9d9db63218e03b45dd61cb1b Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 9 Jul 2018 23:45:27 -0500 Subject: [PATCH 20/22] more review fixes --- pandas/core/arrays/interval.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 7eccdef9ad929..d6fe1118286d6 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -454,11 +454,11 @@ def __getitem__(self, value): def __setitem__(self, key, value): # na value: need 
special casing to set directly on numpy arrays - _needs_float_conversion = False + needs_float_conversion = False if is_scalar(value) and isna(value): if is_integer_dtype(self.dtype.subtype): # can't set NaN on a numpy integer array - _needs_float_conversion = True + needs_float_conversion = True elif is_datetime64_any_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array value = np.datetime64('NaT') @@ -485,13 +485,13 @@ def __setitem__(self, key, value): # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. left = self.left.copy(deep=True) - if _needs_float_conversion: + if needs_float_conversion: left = left.astype('float') left.values[key] = value_left self._left = left right = self.right.copy(deep=True) - if _needs_float_conversion: + if needs_float_conversion: right = right.astype('float') right.values[key] = value_right self._right = right @@ -505,7 +505,7 @@ def fillna(self, value=None, method=None, limit=None): if not isinstance(value, ABCInterval): msg = ("'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval'. Got a '{}' instead." + "scalar 'pandas.Interval'. Got a '{}' instead." 
.format(type(value).__name__)) raise TypeError(msg) @@ -580,7 +580,6 @@ def _concat_same_type(cls, to_concat): raise ValueError("Intervals must all be closed on the same side.") closed = closed.pop() - # TODO: avoid intermediate list left = np.concatenate([interval.left for interval in to_concat]) right = np.concatenate([interval.right for interval in to_concat]) return cls._simple_new(left, right, closed=closed, copy=False) From 99ab41fccf8e54cf0387d489f4ddfc809bf8af94 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Tue, 10 Jul 2018 11:59:57 -0500 Subject: [PATCH 21/22] more review fixes --- doc/source/basics.rst | 2 +- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/arrays/interval.py | 26 ++++++++++++++++++++++++++ pandas/tests/extension/base/setitem.py | 1 - 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index cd88d23236a38..c18b94fea9a28 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1935,7 +1935,7 @@ NumPy's type-system for a few cases. * :ref:`Categorical ` * :ref:`Datetime with Timezone ` * :ref:`Period ` -* Interval +* :ref:`Interval ` Pandas uses the ``object`` dtype for storing strings. 
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5d01b0bcd86f4..6ffa7ebf994e5 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -414,7 +414,7 @@ Interval ^^^^^^^^ - Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) -- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue`20611`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) - - diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index d6fe1118286d6..4ad53e16bc439 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -497,6 +497,32 @@ def __setitem__(self, key, value): self._right = right def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should be either Interval objects or NA/NaN. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + (Not implemented yet for IntervalArray) + Method to use for filling holes in reindexed Series + limit : int, default None + (Not implemented yet for IntervalArray) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. 
+ + Returns + ------- + filled : IntervalArray with NA/NaN filled + """ if method is not None: raise TypeError('Filling by method is not supported for ' 'IntervalArray.') diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index cf302b375bcb8..4e27f1eca538f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -9,7 +9,6 @@ class BaseSetitemTests(BaseExtensionTests): - def test_setitem_scalar_series(self, data): arr = pd.Series(data) arr[0] = data[1] From 385ce59f79ff1921a44f3bbaa17044032e3ed4d3 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Thu, 12 Jul 2018 19:09:21 -0500 Subject: [PATCH 22/22] undo parametrize in test_hasing.py and move to separate PR --- pandas/tests/util/test_hashing.py | 48 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 7badb0817c511..82b870c156cc8 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -46,11 +46,10 @@ def test_hash_array_mixed(self): tm.assert_numpy_array_equal(result1, result2) tm.assert_numpy_array_equal(result1, result3) - @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')]) - def test_hash_array_errors(self, val): - msg = 'must pass a ndarray-like' - with tm.assert_raises_regex(TypeError, msg): - hash_array(val) + def test_hash_array_errors(self): + + for val in [5, 'foo', pd.Timestamp('20130101')]: + pytest.raises(TypeError, hash_array, val) def check_equal(self, obj, **kwargs): a = hash_pandas_object(obj, **kwargs) @@ -81,27 +80,26 @@ def test_hash_tuples(self): result = hash_tuples(tups[0]) assert result == expected[0] - @pytest.mark.parametrize('tup', [ - (1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), - ('A', pd.Timestamp("2012-01-01"))]) - def test_hash_tuple(self, tup): + def test_hash_tuple(self): # test equivalence between hash_tuples and hash_tuple - 
result = hash_tuple(tup) - expected = hash_tuples([tup])[0] - assert result == expected - - @pytest.mark.parametrize('val', [ - 1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-01", tz='Europe/Brussels'), - datetime.datetime(2012, 1, 1), - pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), - pd.Timedelta('1 days'), datetime.timedelta(1), - pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), - np.nan, pd.NaT, None]) - def test_hash_scalar(self, val): - result = _hash_scalar(val) - expected = hash_array(np.array([val], dtype=object), categorize=True) - assert result[0] == expected[0] + for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), + ('A', pd.Timestamp("2012-01-01"))]: + result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + assert result == expected + + def test_hash_scalar(self): + for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz='Europe/Brussels'), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), + pd.Timedelta('1 days'), datetime.timedelta(1), + pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), + np.nan, pd.NaT, None]: + result = _hash_scalar(val) + expected = hash_array(np.array([val], dtype=object), + categorize=True) + assert result[0] == expected[0] def test_hash_tuples_err(self):