Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate numpy and dask vindex handling #3

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions xarray/backends/scipy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,6 @@ def get_array(self):
return self.datastore.ds.variables[self.variable_name].data

def __getitem__(self, key):
if isinstance(key, OuterIndexer):
key = key.vectorize(self.shape)

key = to_tuple(key)
with self.datastore.ensure_open(autoclose=True):
data = NumpyIndexingAdapter(self.get_array())[key]
# Copy data if the source file is mmapped.
Expand Down
143 changes: 70 additions & 73 deletions xarray/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np
import pandas as pd

from . import nputils
from . import utils
from .npcompat import moveaxis
from .pycompat import (iteritems, range, integer_types, dask_array_type,
Expand Down Expand Up @@ -283,32 +284,6 @@ class OuterIndexer(IndexerTuple):
""" Tuple for outer/orthogonal indexing.
All the item is one of integer, slice, and 1d-np.ndarray.
"""
def vectorize(self, shape):
""" Convert to a vectorized indexer.
shape: shape of the array subject to the indexing.
"""
if len([k for k in self if not isinstance(k, slice)]) <= 1:
# if there is only one vector and all others are slice,
# it can be safely converted to vectorized indexer
# Boolean index should be converted to integer array.
return VectorizedIndexer(self)
else:
n_dim = len([k for k in self if not isinstance(k, integer_types)])
i_dim = 0
new_key = []
for k, size in zip(self, shape):
if isinstance(k, integer_types):
new_key.append(k)
else: # np.ndarray or slice
if isinstance(k, slice):
k = np.arange(*k.indices(size))
if k.dtype.kind == 'b':
(k, ) = k.nonzero()
shape = [(1,) * i_dim + (k.size, ) +
(1,) * (n_dim - i_dim - 1)]
new_key.append(k.reshape(*shape))
i_dim += 1
return VectorizedIndexer(new_key)


class VectorizedIndexer(IndexerTuple):
Expand Down Expand Up @@ -444,6 +419,44 @@ def xarray_indexable(array):
return array


def _outer_to_numpy_indexer(key, shape):
"""Convert an OuterIndexer into an indexer for NumPy.

Parameters
----------
key : OuterIndexer
Outer indexing tuple to convert.
shape : tuple
Shape of the array subject to the indexing.

Returns
-------
tuple
Base tuple suitable for use to index a NumPy array.
"""
if len([k for k in key if not isinstance(k, slice)]) <= 1:
# If there is only one vector and all others are slice,
# it can be safely used in mixed basic/advanced indexing.
# Boolean index should already be converted to integer array.
return tuple(key)

n_dim = len([k for k in key if not isinstance(k, integer_types)])
i_dim = 0
new_key = []
for k, size in zip(key, shape):
if isinstance(k, integer_types):
new_key.append(k)
else: # np.ndarray or slice
if isinstance(k, slice):
k = np.arange(*k.indices(size))
assert k.dtype.kind == 'i'
shape = [(1,) * i_dim + (k.size, ) +
(1,) * (n_dim - i_dim - 1)]
new_key.append(k.reshape(*shape))
i_dim += 1
return tuple(new_key)


class NumpyIndexingAdapter(utils.NDArrayMixin):
"""Wrap a NumPy array to use broadcasted indexing
"""
Expand All @@ -459,17 +472,24 @@ def _ensure_ndarray(self, value):
value = utils.to_0d_array(value)
return value

def __getitem__(self, key):
def _indexing_array_and_key(self, key):
if isinstance(key, OuterIndexer):
key = key.vectorize(self.shape)
key = to_tuple(key)
return self._ensure_ndarray(self.array[key])
key = _outer_to_numpy_indexer(key, self.array.shape)

if isinstance(key, VectorizedIndexer):
array = nputils.NumpyVIndexAdapter(self.array)
else:
array = self.array

return array, to_tuple(key)

def __getitem__(self, key):
array, key = self._indexing_array_and_key(key)
return self._ensure_ndarray(array[key])

def __setitem__(self, key, value):
if isinstance(key, OuterIndexer):
key = key.vectorize(self.shape)
key = to_tuple(key)
self.array[key] = value
array, key = self._indexing_array_and_key(key)
array[key] = value


class DaskIndexingAdapter(utils.NDArrayMixin):
Expand All @@ -482,51 +502,28 @@ def __init__(self, array):
self.array = array

def __getitem__(self, key):
# should always get PointwiseIndexer instead
assert not isinstance(key, VectorizedIndexer)

if isinstance(key, PointwiseIndexer):
return self._getitem_pointwise(key)

try:
key = to_tuple(key)
return self.array[key]
except NotImplementedError:
# manual orthogonal indexing.
value = self.array
for axis, subkey in reversed(list(enumerate(key))):
value = value[(slice(None),) * axis + (subkey,)]
return value

def _getitem_pointwise(self, key):
pointwise_shape, pointwise_index = next(
(k.shape, i) for i, k in enumerate(key)
if not isinstance(k, slice))
# dask's indexing only handles 1d arrays
flat_key = tuple(k if isinstance(k, slice) else k.ravel()
for k in key)

if len([k for k in key if not isinstance(k, slice)]) == 1:
# vindex requires more than one non-slice :(
# but we can use normal indexing instead
indexed = self.array[flat_key]
new_shape = (indexed.shape[:pointwise_index] +
pointwise_shape +
indexed.shape[pointwise_index + 1:])
return indexed.reshape(new_shape)
if isinstance(key, BasicIndexer):
return self.array[tuple(key)]
elif isinstance(key, VectorizedIndexer):
return self.array.vindex[tuple(key)]
else:
indexed = self.array.vindex[flat_key]
# vindex always moves slices to the end
reshaped = indexed.reshape(pointwise_shape + indexed.shape[1:])
# reorder dimensions to match order of appearance
positions = np.arange(0, len(pointwise_shape))
return moveaxis(reshaped, positions, positions + pointwise_index)
assert isinstance(key, OuterIndexer)
key = tuple(key)
try:
return self.array[key]
except NotImplementedError:
# manual orthogonal indexing.
# TODO: port this upstream into dask in a saner way.
value = self.array
for axis, subkey in reversed(list(enumerate(key))):
value = value[(slice(None),) * axis + (subkey,)]
return value

def __setitem__(self, key, value):
raise TypeError("this variable's data is stored in a dask array, "
'which does not support item assignment. To '
'assign to this variable, you must first load it '
'into memory explicitly using the .load_data() '
'into memory explicitly using the .load() '
'method or accessing its .values attribute.')


Expand Down
54 changes: 54 additions & 0 deletions xarray/core/nputils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import pandas as pd
import warnings

from .npcompat import moveaxis


def _validate_axis(data, axis):
ndim = data.ndim
Expand Down Expand Up @@ -79,3 +81,55 @@ def array_ne(self, other):
with warnings.catch_warnings():
warnings.filterwarnings('ignore', r'elementwise comparison failed')
return _ensure_bool_is_ndarray(self != other, self, other)


def _is_contiguous(positions):
"""Given a non-empty list, does it consist of contiguous integers?"""
previous = positions[0]
for current in positions[1:]:
if current != previous + 1:
return False
previous = current
return True


def _advanced_indexer_subspaces(key):
"""Indices of the advanced indexes subspaces for mixed indexing and vindex.
"""
if not isinstance(key, tuple):
key = (key,)
advanced_index_positions = [i for i, k in enumerate(key)
if not isinstance(k, slice)]

if (not advanced_index_positions or
not _is_contiguous(advanced_index_positions)):
# Nothing to reorder: dimensions on the indexing result are already
# ordered like vindex. See NumPy's rule for "Combining advanced and
# basic indexing":
# https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#combining-advanced-and-basic-indexing
return (), ()

non_slices = [k for k in key if not isinstance(k, slice)]
ndim = len(np.broadcast(*non_slices).shape)
mixed_positions = advanced_index_positions[0] + np.arange(ndim)
vindex_positions = np.arange(ndim)
return mixed_positions, vindex_positions


class NumpyVIndexAdapter(object):
"""Object that implements indexing like vindex on a np.ndarray.

This is a pure Python implementation of (some of) the logic in this NumPy
proposal: https://github.com/numpy/numpy/pull/6256
"""
def __init__(self, array):
self._array = array

def __getitem__(self, key):
mixed_positions, vindex_positions = _advanced_indexer_subspaces(key)
return moveaxis(self._array[key], mixed_positions, vindex_positions)

def __setitem__(self, key, value):
"""Value must have dimensionality matching the key."""
mixed_positions, vindex_positions = _advanced_indexer_subspaces(key)
self._array[key] = moveaxis(value, vindex_positions, mixed_positions)
39 changes: 38 additions & 1 deletion xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import itertools
import re
import warnings
from collections import Mapping, MutableMapping, Iterable
from collections import Mapping, MutableMapping, MutableSet, Iterable

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -378,6 +378,43 @@ def __len__(self):
raise len(iter(self))


class OrderedSet(MutableSet):
"""A simple ordered set.

The API matches the builtin set, but it preserves insertion order of
elements, like an OrderedDict.
"""
def __init__(self, values=None):
self._ordered_dict = OrderedDict()
if values is not None:
self |= values

# Required methods for MutableSet

def __contains__(self, value):
return value in self._ordered_dict

def __iter__(self):
return iter(self._ordered_dict)

def __len__(self):
return len(self._ordered_dict)

def add(self, value):
self._ordered_dict[value] = None

def discard(self, value):
del self._ordered_dict[value]

# Additional methods

def update(self, values):
self |= values

def __repr__(self):
return '%s(%r)' % (type(self).__name__, list(self))


class NdimSizeLenMixin(object):
"""Mixin class that extends a class that defines a ``shape`` property to
one that also defines ``ndim``, ``size`` and ``__len__``.
Expand Down
Loading