From 105bd646007083aef6fe4c8540cc66caa5d18e16 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 8 Jul 2017 00:00:32 +0900 Subject: [PATCH 001/113] Implemented `_broadcast_indexes` in Variable.py --- xarray/core/variable.py | 95 +++++++++++++++++++++++++++++++++++ xarray/tests/test_variable.py | 52 +++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ad4836b930f..6ea0de3f88a 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -28,6 +28,7 @@ pass + def as_variable(obj, name=None): """Convert an object into a Variable. @@ -406,6 +407,100 @@ def __getitem__(self, key): return type(self)(dims, values, self._attrs, self._encoding, fastpath=True) + def _broadcast_indexes(self, key): + """ + Parameters + ----------- + key: One of + array + a mapping of dimension names to index. + + Returns + ------- + indexers: list of integer, array-like, or slice. This is aligned + along self.dims. + dims: Tuple of strings. + Dimension of the resultant variable. 
+ """ + if not utils.is_dict_like(key): + key = {self.dims[0]: key} + example_v = None + indexes = OrderedDict() + for k, v in key.items(): + if not isinstance(v, (integer_types, slice, Variable)): + if not hasattr(key, 'ndim'): # convert list or tuple + v = np.array(v) + if example_v is None and isinstance(v, Variable): + example_v = v + indexes[k] = v + + # When all the keys are array or integer, slice + if example_v is None: + # more than one (unlabelled) arrays + if len([v for k, v in indexes.items() + if not isinstance(v, (integer_types, slice))]) > 1: + raise IndexError("Indexing with multiple unlabelled arrays " + "is not allowed.") + # multi-dimensional unlabelled array + if any([v.ndim > 1 for k, v in indexes.items() + if not isinstance(v, integer_types)]): + raise IndexError("Indexing with a multi-dimensional unlabelled" + "array is not allowed.") + # convert the array into Variable + for k, v in indexes.items(): + if not hasattr(v, 'dims'): + indexes[k] = type(self)([k], v) + example_v = v + + for k, v in indexes.items(): + # Found unlabelled array + if not isinstance(v, (Variable, integer_types, slice)): + if (v.ndim > example_v.ndim or + any([example_v.ndim != v.ndim for k, v + in indexes.items() if isinstance(v, Variable)])): + raise IndexError("Broadcasting failed because dimensions " + "does not match.") + else: + _, indexes[k], _ = _broadcast_compat_data(example_v, v) + + # now indexes is a list. + index_tuple = tuple(indexes.get(d, slice(None)) for d in self.dims) + index_tuple = indexing.expanded_indexer(index_tuple, self.ndim) + + # comput dims + dims = [] + for i, d in enumerate(self.dims): + if d in indexes.keys(): + if isinstance(v, Variable): + for d in v.dims: + if d not in dims: + dims.append(d) + else: + dims.append(d) + + return dims, index_tuple + + def getitem2(self, variables): + """Return a new Array object whose contents are consistent with + getting the provided key from the underlying data. 
+ + Parameters + ----------- + variables: Variable or a dict mapping dimension to Variables. + + This method will replace original __getitem__ after we confirm its + stability. + """ + dims, key = self._broadcast_indexes(key) + key = indexing.expanded_indexer(key, self.ndim) + values = self._indexable_data[key] + if hasattr(values, 'ndim'): + assert values.ndim == len(dims), (values.ndim, len(dims)) + else: + assert len(dims) == 0, len(dims) + return type(self)(dims, values, self._attrs, self._encoding, + fastpath=True) + def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy values with orthogonal indexing. diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f5d207d0978..97c764be75e 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -709,6 +709,58 @@ def test_items(self): v[range(10), range(11)] = 1 self.assertArrayEqual(v.values, np.ones((10, 11))) + def test_bloadcast_indexes(self): + v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + + with self.assertRaisesRegexp(IndexError, "Indexing with multiple"): + v._broadcast_indexes(dict(x=[0, 1], y=[0, 1])) + + with self.assertRaisesRegexp(IndexError, "Indexing with a multi-"): + v._broadcast_indexes([[0, 1], [1, 2]]) + + dims, index_tuple = v._broadcast_indexes([0, 1]) + self.assertTrue(dims == ['x', 'y']) + self.assertTrue(np.allclose(index_tuple[0], [0, 1])) + self.assertTrue(index_tuple[1] == slice(None, None, None)) + + ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) + dims, index_tuple = v._broadcast_indexes(ind) + self.assertTrue(dims == ['a', 'b', 'y']) + self.assertTrue(np.allclose(index_tuple[0], [[0, 1, 2], [2, 1, 0]])) + self.assertTrue(index_tuple[1] == slice(None, None, None)) + + ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) + dims, index_tuple = v._broadcast_indexes(dict(y=ind)) + self.assertTrue(dims == ['x', 'a', 'b']) + self.assertTrue(index_tuple[0] == slice(None, None, None)) + 
self.assertTrue(np.allclose(index_tuple[1], [[0, 1, 2], [2, 1, 0]])) + + # with broadcast + ind = Variable(['a'], [0, 1]) + dims, index_tuple = v._broadcast_indexes(dict(x=[0, 1], y=ind)) + self.assertTrue(dims == ['a']) + self.assertTrue(np.allclose(index_tuple[0], [0, 1])) + self.assertTrue(np.allclose(index_tuple[1], [0, 1])) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + dims, index_tuple = v._broadcast_indexes(dict(x=[[1, 0], [1, 0]], + y=ind)) + self.assertTrue(dims == ['a', 'b']) + self.assertTrue(np.allclose(index_tuple[0], [[1, 0], [1, 0]])) + self.assertTrue(np.allclose(index_tuple[1], [[0, 0], [1, 1]])) + + # broadcast impossible case + with self.assertRaisesRegexp(IndexError, "Broadcasting failed "): + ind = Variable(['a'], [0, 1]) + dims, index_tuple = v._broadcast_indexes(dict(x=[[1, 0], [1, 0]], + y=ind)) + # with integer + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + dims, index_tuple = v._broadcast_indexes(dict(x=0, y=ind)) + self.assertTrue(dims == ['a', 'b']) + self.assertTrue(np.allclose(index_tuple[0], 0)) + self.assertTrue(np.allclose(index_tuple[1], [[0, 0], [1, 1]])) + def test_isel(self): v = Variable(['time', 'x'], self.d) self.assertVariableIdentical(v.isel(time=slice(None)), v) From 23b4fe0a8c6a5787fac391879d7c19a09a511048 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 10 Jul 2017 09:37:00 +0900 Subject: [PATCH 002/113] Diagonal indexing for Variable. --- xarray/core/variable.py | 20 ++++++++------------ xarray/tests/test_variable.py | 32 +++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6ea0de3f88a..ceeffb2a895 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -417,10 +417,10 @@ def _broadcast_indexes(self, key): Returns ------- - indexers: list of integer, array-like, or slice. This is aligned - along self.dims. dims: Tuple of strings. Dimension of the resultant variable. 
+ indexers: list of integer, array-like, or slice. This is aligned + along self.dims. """ if not utils.is_dict_like(key): key = {self.dims[0]: key} @@ -463,7 +463,6 @@ def _broadcast_indexes(self, key): else: _, indexes[k], _ = _broadcast_compat_data(example_v, v) - # now indexes is a list. index_tuple = tuple(indexes.get(d, slice(None)) for d in self.dims) index_tuple = indexing.expanded_indexer(index_tuple, self.ndim) @@ -480,20 +479,17 @@ def _broadcast_indexes(self, key): return dims, index_tuple - def getitem2(self, variables): + def getitem2(self, key): """Return a new Array object whose contents are consistent with getting the provided key from the underlying data. - Parameters - ----------- - variables: Variable or a dict mapping dimension to Variables. + NB. __getitem__ and __setitem__ implement "diagonal indexing" like + np.ndarray. - This method will replace original __getitem__ after we confirm its - stability. + This method will replace __getitem__ after we make sure its stability. 
""" - dims, key = self._broadcast_indexes(key) - key = indexing.expanded_indexer(key, self.ndim) - values = self._indexable_data[key] + dims, index_tuple = self._broadcast_indexes(key) + values = self._data[index_tuple] if hasattr(values, 'ndim'): assert values.ndim == len(dims), (values.ndim, len(dims)) else: diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 97c764be75e..d1f67e64fb8 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -709,31 +709,41 @@ def test_items(self): v[range(10), range(11)] = 1 self.assertArrayEqual(v.values, np.ones((10, 11))) - def test_bloadcast_indexes(self): + def test_getitem2(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) with self.assertRaisesRegexp(IndexError, "Indexing with multiple"): - v._broadcast_indexes(dict(x=[0, 1], y=[0, 1])) + v.getitem2(dict(x=[0, 1], y=[0, 1])) with self.assertRaisesRegexp(IndexError, "Indexing with a multi-"): - v._broadcast_indexes([[0, 1], [1, 2]]) + v.getitem2([[0, 1], [1, 2]]) dims, index_tuple = v._broadcast_indexes([0, 1]) self.assertTrue(dims == ['x', 'y']) self.assertTrue(np.allclose(index_tuple[0], [0, 1])) self.assertTrue(index_tuple[1] == slice(None, None, None)) + v_new = v.getitem2([0, 1]) + self.assertTrue(v_new.dims == ('x', 'y')) + self.assertArrayEqual(v_new, v._data[[0, 1]]) - ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) + ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) dims, index_tuple = v._broadcast_indexes(ind) self.assertTrue(dims == ['a', 'b', 'y']) - self.assertTrue(np.allclose(index_tuple[0], [[0, 1, 2], [2, 1, 0]])) + self.assertTrue(np.allclose(index_tuple[0], [[0, 1, 1], [1, 1, 0]])) self.assertTrue(index_tuple[1] == slice(None, None, None)) + v_new = v.getitem2(ind) + self.assertTrue(v_new.dims == ('a', 'b', 'y')) + self.assertArrayEqual(v_new, v._data[([0, 1, 1], [1, 1, 0]), :]) ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) dims, index_tuple = v._broadcast_indexes(dict(y=ind)) 
self.assertTrue(dims == ['x', 'a', 'b']) + self.assertTrue(len(index_tuple) == 2) self.assertTrue(index_tuple[0] == slice(None, None, None)) self.assertTrue(np.allclose(index_tuple[1], [[0, 1, 2], [2, 1, 0]])) + v_new = v.getitem2(dict(y=ind)) + self.assertTrue(v_new.dims == ('x', 'a', 'b')) + self.assertArrayEqual(v_new, v._data[:, ([0, 1, 2], [2, 1, 0])]) # with broadcast ind = Variable(['a'], [0, 1]) @@ -741,6 +751,8 @@ def test_bloadcast_indexes(self): self.assertTrue(dims == ['a']) self.assertTrue(np.allclose(index_tuple[0], [0, 1])) self.assertTrue(np.allclose(index_tuple[1], [0, 1])) + v_new = v.getitem2(dict(x=[0, 1], y=ind)) + self.assertArrayEqual(v_new, v._data[[0, 1], [0, 1]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) dims, index_tuple = v._broadcast_indexes(dict(x=[[1, 0], [1, 0]], @@ -748,18 +760,24 @@ def test_bloadcast_indexes(self): self.assertTrue(dims == ['a', 'b']) self.assertTrue(np.allclose(index_tuple[0], [[1, 0], [1, 0]])) self.assertTrue(np.allclose(index_tuple[1], [[0, 0], [1, 1]])) + v_new = v.getitem2(dict(x=[[1, 0], [1, 0]], y=ind)) + self.assertArrayEqual(v_new, + v._data[([1, 0], [1, 0]), ([0, 0], [1, 1])]) # broadcast impossible case with self.assertRaisesRegexp(IndexError, "Broadcasting failed "): ind = Variable(['a'], [0, 1]) - dims, index_tuple = v._broadcast_indexes(dict(x=[[1, 0], [1, 0]], - y=ind)) + v.getitem2(dict(x=[[1, 0], [1, 0]], y=ind)) + # with integer ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) dims, index_tuple = v._broadcast_indexes(dict(x=0, y=ind)) self.assertTrue(dims == ['a', 'b']) self.assertTrue(np.allclose(index_tuple[0], 0)) self.assertTrue(np.allclose(index_tuple[1], [[0, 0], [1, 1]])) + v_new = v.getitem2(dict(x=0, y=ind)) + self.assertArrayEqual(v_new, + v._data[0, ([0, 0], [1, 1])]) def test_isel(self): v = Variable(['time', 'x'], self.d) From 726ba5ddef8841ad3d2a25dfa4b1e78dc90e1810 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 12 Jul 2017 18:59:18 +0900 Subject: [PATCH 003/113] update 
_broadcast_indexes. update tests. --- xarray/core/variable.py | 88 +++++++++++----------------- xarray/tests/test_variable.py | 107 ++++++++++++++++++++-------------- 2 files changed, 97 insertions(+), 98 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ceeffb2a895..615ec1853c3 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -422,62 +422,40 @@ def _broadcast_indexes(self, key): indexers: list of integer, array-like, or slice. This is aligned along self.dims. """ - if not utils.is_dict_like(key): - key = {self.dims[0]: key} - example_v = None - indexes = OrderedDict() - for k, v in key.items(): - if not isinstance(v, (integer_types, slice, Variable)): - if not hasattr(key, 'ndim'): # convert list or tuple - v = np.array(v) - if example_v is None and isinstance(v, Variable): - example_v = v - indexes[k] = v - - # When all the keys are array or integer, slice - if example_v is None: - # more than one (unlabelled) arrays - if len([v for k, v in indexes.items() - if not isinstance(v, (integer_types, slice))]) > 1: - raise IndexError("Indexing with multiple unlabelled arrays " - "is not allowed.") - # multi-dimensional unlabelled array - if any([v.ndim > 1 for k, v in indexes.items() - if not isinstance(v, integer_types)]): - raise IndexError("Indexing with a multi-dimensional unlabelled" - "array is not allowed.") - # convert the array into Variable - for k, v in indexes.items(): - if not hasattr(v, 'dims'): - indexes[k] = type(self)([k], v) - example_v = v - - for k, v in indexes.items(): - # Found unlabelled array - if not isinstance(v, (Variable, integer_types, slice)): - if (v.ndim > example_v.ndim or - any([example_v.ndim != v.ndim for k, v - in indexes.items() if isinstance(v, Variable)])): - raise IndexError("Broadcasting failed because dimensions " - "does not match.") - else: - _, indexes[k], _ = _broadcast_compat_data(example_v, v) - - index_tuple = tuple(indexes.get(d, slice(None)) for d in self.dims) - 
index_tuple = indexing.expanded_indexer(index_tuple, self.ndim) - - # comput dims - dims = [] - for i, d in enumerate(self.dims): - if d in indexes.keys(): - if isinstance(v, Variable): - for d in v.dims: - if d not in dims: - dims.append(d) - else: - dims.append(d) + key = self._item_key_to_tuple(key) # key is a tuple + # key is a tuple of full size + key = indexing.expanded_indexer(key, self.ndim) + basic_indexing_types = integer_types + (slice,) + if all([isinstance(k, basic_indexing_types) for k in key]): + return self._broadcast_indexes_basic(key) + else: + return self._broadcast_indexes_advanced(key) + + def _broadcast_indexes_basic(self, key): + dims = tuple(dim for k, dim in zip(key, self.dims) + if not isinstance(k, integer_types)) + return dims, key + + def _broadcast_indexes_advanced(self, key): + variables = [] - return dims, index_tuple + for dim, value in zip(self.dims, key): + if isinstance(value, slice): + value = np.arange(self.sizes[dim])[value] + + try: # TODO we need our own Exception. 
+ variable = as_variable(value, name=dim) + except ValueError as e: + if "cannot set variable" in str(e): + raise IndexError("Unlabelled multi-dimensional array " + "cannot be used for indexing.") + else: + raise e + variables.append(variable) + variables = _broadcast_compat_variables(*variables) + dims = variables[0].dims # all variables have the same dims + key = tuple(variable.data for variable in variables) + return dims, key def getitem2(self, key): """Return a new Array object whose contents are consistent with diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d1f67e64fb8..36762b0331c 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -709,75 +709,96 @@ def test_items(self): v[range(10), range(11)] = 1 self.assertArrayEqual(v.values, np.ones((10, 11))) - def test_getitem2(self): + def test_getitem2_basic(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) - with self.assertRaisesRegexp(IndexError, "Indexing with multiple"): - v.getitem2(dict(x=[0, 1], y=[0, 1])) + v_new = v.getitem2(dict(x=0)) + self.assertTrue(v_new.dims == ('y', )) + self.assertArrayEqual(v_new, v._data[0]) - with self.assertRaisesRegexp(IndexError, "Indexing with a multi-"): - v.getitem2([[0, 1], [1, 2]]) + v_new = v.getitem2(dict(x=0, y=slice(None))) + self.assertTrue(v_new.dims == ('y', )) + self.assertArrayEqual(v_new, v._data[0]) + + v_new = v.getitem2(dict(x=0, y=1)) + self.assertTrue(v_new.dims == ()) + self.assertArrayEqual(v_new, v._data[0, 1]) + + v_new = v.getitem2(dict(y=1)) + self.assertTrue(v_new.dims == ('x', )) + self.assertArrayEqual(v_new, v._data[:, 1]) + + # tuple argument + v_new = v.getitem2((slice(None), 1)) + self.assertTrue(v_new.dims == ('x', )) + self.assertArrayEqual(v_new, v._data[:, 1]) + + def test_getitem2_advanced(self): + v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + + # orthogonal indexing + v_new = v.getitem2(([0, 1], [1, 0])) + self.assertTrue(v_new.dims == ('x', 'y')) + 
self.assertArrayEqual(v_new, v._data[[0, 1]][:, [1, 0]]) - dims, index_tuple = v._broadcast_indexes([0, 1]) - self.assertTrue(dims == ['x', 'y']) - self.assertTrue(np.allclose(index_tuple[0], [0, 1])) - self.assertTrue(index_tuple[1] == slice(None, None, None)) v_new = v.getitem2([0, 1]) self.assertTrue(v_new.dims == ('x', 'y')) self.assertArrayEqual(v_new, v._data[[0, 1]]) ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) - dims, index_tuple = v._broadcast_indexes(ind) - self.assertTrue(dims == ['a', 'b', 'y']) - self.assertTrue(np.allclose(index_tuple[0], [[0, 1, 1], [1, 1, 0]])) - self.assertTrue(index_tuple[1] == slice(None, None, None)) v_new = v.getitem2(ind) self.assertTrue(v_new.dims == ('a', 'b', 'y')) self.assertArrayEqual(v_new, v._data[([0, 1, 1], [1, 1, 0]), :]) ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) - dims, index_tuple = v._broadcast_indexes(dict(y=ind)) - self.assertTrue(dims == ['x', 'a', 'b']) - self.assertTrue(len(index_tuple) == 2) - self.assertTrue(index_tuple[0] == slice(None, None, None)) - self.assertTrue(np.allclose(index_tuple[1], [[0, 1, 2], [2, 1, 0]])) v_new = v.getitem2(dict(y=ind)) self.assertTrue(v_new.dims == ('x', 'a', 'b')) self.assertArrayEqual(v_new, v._data[:, ([0, 1, 2], [2, 1, 0])]) - # with broadcast + # with mixed arguments ind = Variable(['a'], [0, 1]) - dims, index_tuple = v._broadcast_indexes(dict(x=[0, 1], y=ind)) - self.assertTrue(dims == ['a']) - self.assertTrue(np.allclose(index_tuple[0], [0, 1])) - self.assertTrue(np.allclose(index_tuple[1], [0, 1])) v_new = v.getitem2(dict(x=[0, 1], y=ind)) - self.assertArrayEqual(v_new, v._data[[0, 1], [0, 1]]) + self.assertTrue(v_new.dims == ('x', 'a')) + self.assertArrayEqual(v_new, v._data[[0, 1]][:, [0, 1]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - dims, index_tuple = v._broadcast_indexes(dict(x=[[1, 0], [1, 0]], - y=ind)) - self.assertTrue(dims == ['a', 'b']) - self.assertTrue(np.allclose(index_tuple[0], [[1, 0], [1, 0]])) - 
self.assertTrue(np.allclose(index_tuple[1], [[0, 0], [1, 1]])) - v_new = v.getitem2(dict(x=[[1, 0], [1, 0]], y=ind)) - self.assertArrayEqual(v_new, - v._data[([1, 0], [1, 0]), ([0, 0], [1, 1])]) - - # broadcast impossible case - with self.assertRaisesRegexp(IndexError, "Broadcasting failed "): - ind = Variable(['a'], [0, 1]) - v.getitem2(dict(x=[[1, 0], [1, 0]], y=ind)) + v_new = v.getitem2(dict(x=[1, 0], y=ind)) + self.assertTrue(v_new.dims == ('x', 'a', 'b')) + self.assertArrayEqual(v_new, v._data[[1, 0]][:, ind]) # with integer ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - dims, index_tuple = v._broadcast_indexes(dict(x=0, y=ind)) - self.assertTrue(dims == ['a', 'b']) - self.assertTrue(np.allclose(index_tuple[0], 0)) - self.assertTrue(np.allclose(index_tuple[1], [[0, 0], [1, 1]])) v_new = v.getitem2(dict(x=0, y=ind)) - self.assertArrayEqual(v_new, - v._data[0, ([0, 0], [1, 1])]) + self.assertTrue(v_new.dims == ('a', 'b')) + self.assertArrayEqual(v_new[0], v._data[0][[0, 0]]) + self.assertArrayEqual(v_new[1], v._data[0][[1, 1]]) + + # with slice + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v.getitem2(dict(x=slice(None), y=ind)) + self.assertTrue(v_new.dims == ('x', 'a', 'b')) + self.assertArrayEqual(v_new, v._data[:, [[0, 0], [1, 1]]]) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v.getitem2(dict(x=ind, y=slice(None))) + self.assertTrue(v_new.dims == ('a', 'b', 'y')) + self.assertArrayEqual(v_new, v._data[[[0, 0], [1, 1]], :]) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v.getitem2(dict(x=ind, y=slice(None, 1))) + self.assertTrue(v_new.dims == ('a', 'b', 'y')) + self.assertArrayEqual(v_new, v._data[[[0, 0], [1, 1]], slice(None, 1)]) + + def test_getitem2_error(self): + v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + + with self.assertRaisesRegexp(IndexError, "Unlabelled multi-"): + v.getitem2([[0, 1], [1, 2]]) + + with self.assertRaisesRegexp(ValueError, "operands cannot be "): + ind_x = Variable(['a', 'b'], [[0, 0], [1, 
1]]) + ind_y = Variable(['a'], [0]) + v.getitem2((ind_x, ind_y)) def test_isel(self): v = Variable(['time', 'x'], self.d) From df7011fbbede8d5911bdbe510c469f2f6820f6b0 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 16 Jul 2017 00:45:41 +0900 Subject: [PATCH 004/113] Support basic boolean indexing. --- xarray/core/variable.py | 75 +++++++++++++++++++---------------- xarray/tests/test_variable.py | 51 ++++++++++++++---------- 2 files changed, 72 insertions(+), 54 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 615ec1853c3..557d45f9d94 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -28,7 +28,6 @@ pass - def as_variable(obj, name=None): """Convert an object into a Variable. @@ -377,36 +376,6 @@ def _item_key_to_tuple(self, key): else: return key - def __getitem__(self, key): - """Return a new Array object whose contents are consistent with - getting the provided key from the underlying data. - - NB. __getitem__ and __setitem__ implement "orthogonal indexing" like - netCDF4-python, where the key can only include integers, slices - (including `Ellipsis`) and 1d arrays, each of which are applied - orthogonally along their respective dimensions. - - The difference does not matter in most cases unless you are using - numpy's "fancy indexing," which can otherwise result in data arrays - whose shapes is inconsistent (or just uninterpretable with) with the - variable's dimensions. - - If you really want to do indexing like `x[x > 0]`, manipulate the numpy - array `x.values` directly. 
- """ - key = self._item_key_to_tuple(key) - key = indexing.expanded_indexer(key, self.ndim) - dims = tuple(dim for k, dim in zip(key, self.dims) - if not isinstance(k, integer_types)) - values = self._indexable_data[key] - # orthogonal indexing should ensure the dimensionality is consistent - if hasattr(values, 'ndim'): - assert values.ndim == len(dims), (values.ndim, len(dims)) - else: - assert len(dims) == 0, len(dims) - return type(self)(dims, values, self._attrs, self._encoding, - fastpath=True) - def _broadcast_indexes(self, key): """ Parameters @@ -436,6 +405,25 @@ def _broadcast_indexes_basic(self, key): if not isinstance(k, integer_types)) return dims, key + def nonzero(self): + """ Equivalent numpy's nonzero but returns a tuple of Varibles. """ + if isinstance(self._data, (np.ndarray, pd.Index, PandasIndexAdapter)): + nonzeros = np.nonzero(self._data) + elif isinstance(self._data, dask_array_type): + # TODO we should replace dask's native nonzero + # after https://github.com/dask/dask/issues/1076 is implemented. 
+ nonzeros = np.nonzero(self.load()._data) + + return tuple([as_variable(nz, name=dim) for nz, dim + in zip(nonzeros, self.dims)]) + + def _isbool(self): + """ Return if the variabe is bool or not """ + if isinstance(self._data, (np.ndarray, PandasIndexAdapter, pd.Index)): + return self._data.dtype is np.dtype('bool') + elif isinstance(self._data, dask_array_type): + raise NotImplementedError + def _broadcast_indexes_advanced(self, key): variables = [] @@ -451,13 +439,26 @@ def _broadcast_indexes_advanced(self, key): "cannot be used for indexing.") else: raise e - variables.append(variable) + if variable._isbool(): # boolean indexing case + variables.extend(list(variable.nonzero())) + else: + variables.append(variable) variables = _broadcast_compat_variables(*variables) dims = variables[0].dims # all variables have the same dims key = tuple(variable.data for variable in variables) return dims, key - def getitem2(self, key): + def _ensure_array(self, value): + """ For np.ndarray-based-Variable, we always want the result of + indexing to be a NumPy array. If it's not, then it really should be a + 0d array. Doing the coercion here instead of inside + variable.as_compatible_data makes it less error prone.""" + if isinstance(self._data, np.ndarray): + if not isinstance(value, np.ndarray): + value = utils.to_0d_array(value) + return value + + def __getitem__(self, key): """Return a new Array object whose contents are consistent with getting the provided key from the underlying data. @@ -467,7 +468,13 @@ def getitem2(self, key): This method will replace __getitem__ after we make sure its stability. """ dims, index_tuple = self._broadcast_indexes(key) - values = self._data[index_tuple] + try: + values = self._ensure_array(self._data[index_tuple]) + except NotImplementedError: + # TODO temporal implementation. + # Need to wait for dask's nd index support? 
+ values = self._ensure_array(self.load()._data[index_tuple]) + if hasattr(values, 'ndim'): assert values.ndim == len(dims), (values.ndim, len(dims)) else: diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 36762b0331c..4c137c6c3e6 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -474,6 +474,7 @@ class TestVariable(TestCase, VariableSubclassTestCases): def setUp(self): self.d = np.random.random((10, 3)).astype(np.float64) + self.isdask = False def test_data_and_values(self): v = Variable(['time', 'x'], self.d) @@ -680,6 +681,8 @@ def test_repr_lazy_data(self): def test_items(self): data = np.random.random((10, 11)) v = Variable(['x', 'y'], data) + if self.isdask: + v = v.chunk() # test slicing self.assertVariableIdentical(v, v[:]) self.assertVariableIdentical(v, v[...]) @@ -709,96 +712,96 @@ def test_items(self): v[range(10), range(11)] = 1 self.assertArrayEqual(v.values, np.ones((10, 11))) - def test_getitem2_basic(self): + def test_getitem_basic(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) - v_new = v.getitem2(dict(x=0)) + v_new = v[dict(x=0)] self.assertTrue(v_new.dims == ('y', )) self.assertArrayEqual(v_new, v._data[0]) - v_new = v.getitem2(dict(x=0, y=slice(None))) + v_new = v[dict(x=0, y=slice(None))] self.assertTrue(v_new.dims == ('y', )) self.assertArrayEqual(v_new, v._data[0]) - v_new = v.getitem2(dict(x=0, y=1)) + v_new = v[dict(x=0, y=1)] self.assertTrue(v_new.dims == ()) self.assertArrayEqual(v_new, v._data[0, 1]) - v_new = v.getitem2(dict(y=1)) + v_new = v[dict(y=1)] self.assertTrue(v_new.dims == ('x', )) self.assertArrayEqual(v_new, v._data[:, 1]) # tuple argument - v_new = v.getitem2((slice(None), 1)) + v_new = v[(slice(None), 1)] self.assertTrue(v_new.dims == ('x', )) self.assertArrayEqual(v_new, v._data[:, 1]) - def test_getitem2_advanced(self): + def test_getitem_advanced(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) # orthogonal indexing - v_new = v.getitem2(([0, 
1], [1, 0])) + v_new = v[([0, 1], [1, 0])] self.assertTrue(v_new.dims == ('x', 'y')) self.assertArrayEqual(v_new, v._data[[0, 1]][:, [1, 0]]) - v_new = v.getitem2([0, 1]) + v_new = v[[0, 1]] self.assertTrue(v_new.dims == ('x', 'y')) self.assertArrayEqual(v_new, v._data[[0, 1]]) ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) - v_new = v.getitem2(ind) + v_new = v[ind] self.assertTrue(v_new.dims == ('a', 'b', 'y')) self.assertArrayEqual(v_new, v._data[([0, 1, 1], [1, 1, 0]), :]) ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) - v_new = v.getitem2(dict(y=ind)) + v_new = v[dict(y=ind)] self.assertTrue(v_new.dims == ('x', 'a', 'b')) self.assertArrayEqual(v_new, v._data[:, ([0, 1, 2], [2, 1, 0])]) # with mixed arguments ind = Variable(['a'], [0, 1]) - v_new = v.getitem2(dict(x=[0, 1], y=ind)) + v_new = v[dict(x=[0, 1], y=ind)] self.assertTrue(v_new.dims == ('x', 'a')) self.assertArrayEqual(v_new, v._data[[0, 1]][:, [0, 1]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v.getitem2(dict(x=[1, 0], y=ind)) + v_new = v[dict(x=[1, 0], y=ind)] self.assertTrue(v_new.dims == ('x', 'a', 'b')) self.assertArrayEqual(v_new, v._data[[1, 0]][:, ind]) # with integer ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v.getitem2(dict(x=0, y=ind)) + v_new = v[dict(x=0, y=ind)] self.assertTrue(v_new.dims == ('a', 'b')) self.assertArrayEqual(v_new[0], v._data[0][[0, 0]]) self.assertArrayEqual(v_new[1], v._data[0][[1, 1]]) # with slice ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v.getitem2(dict(x=slice(None), y=ind)) + v_new = v[dict(x=slice(None), y=ind)] self.assertTrue(v_new.dims == ('x', 'a', 'b')) self.assertArrayEqual(v_new, v._data[:, [[0, 0], [1, 1]]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v.getitem2(dict(x=ind, y=slice(None))) + v_new = v[dict(x=ind, y=slice(None))] self.assertTrue(v_new.dims == ('a', 'b', 'y')) self.assertArrayEqual(v_new, v._data[[[0, 0], [1, 1]], :]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = 
v.getitem2(dict(x=ind, y=slice(None, 1))) + v_new = v[dict(x=ind, y=slice(None, 1))] self.assertTrue(v_new.dims == ('a', 'b', 'y')) self.assertArrayEqual(v_new, v._data[[[0, 0], [1, 1]], slice(None, 1)]) - def test_getitem2_error(self): + def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) with self.assertRaisesRegexp(IndexError, "Unlabelled multi-"): - v.getitem2([[0, 1], [1, 2]]) + v[[[0, 1], [1, 2]]] with self.assertRaisesRegexp(ValueError, "operands cannot be "): ind_x = Variable(['a', 'b'], [[0, 0], [1, 1]]) ind_y = Variable(['a'], [0]) - v.getitem2((ind_x, ind_y)) + v[(ind_x, ind_y)] def test_isel(self): v = Variable(['time', 'x'], self.d) @@ -1179,6 +1182,14 @@ def test_count(self): self.assertVariableIdentical(expected, actual) +class TestVariable_withDask(TestVariable): + cls = staticmethod(Variable) + + def setUp(self): + super(TestVariable_withDask, self).setUp() + self.isdask = True + + class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From f9232cbc712fde01adff9e789b881c4d726cb9fc Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 16 Jul 2017 16:26:09 +0900 Subject: [PATCH 005/113] tests for dask-based Variable --- xarray/core/variable.py | 5 +++++ xarray/tests/test_variable.py | 14 +++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 557d45f9d94..75ab78a23ec 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -456,6 +456,11 @@ def _ensure_array(self, value): if isinstance(self._data, np.ndarray): if not isinstance(value, np.ndarray): value = utils.to_0d_array(value) + elif isinstance(self._data, dask_array_type): + print(value) + if not isinstance(value, (dask_array_type, dask_array_type)): + value = utils.to_0d_array(value) + return value def __getitem__(self, key): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 4c137c6c3e6..cc2bfba4915 100644 
--- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -142,9 +142,9 @@ def test_0d_object_array_with_list(self): listarray = np.empty((1,), dtype=object) listarray[0] = [1, 2, 3] x = self.cls('x', listarray) - assert x.data == listarray - assert x[0].data == listarray.squeeze() - assert x.squeeze().data == listarray.squeeze() + self.assertArrayEqual(x.data, listarray) + self.assertArrayEqual(x[0].data, listarray.squeeze()) + self.assertArrayEqual(x.squeeze().data, listarray.squeeze()) def test_index_and_concat_datetime(self): # regression test for #125 @@ -474,7 +474,6 @@ class TestVariable(TestCase, VariableSubclassTestCases): def setUp(self): self.d = np.random.random((10, 3)).astype(np.float64) - self.isdask = False def test_data_and_values(self): v = Variable(['time', 'x'], self.d) @@ -681,8 +680,6 @@ def test_repr_lazy_data(self): def test_items(self): data = np.random.random((10, 11)) v = Variable(['x', 'y'], data) - if self.isdask: - v = v.chunk() # test slicing self.assertVariableIdentical(v, v[:]) self.assertVariableIdentical(v, v[...]) @@ -1182,12 +1179,12 @@ def test_count(self): self.assertVariableIdentical(expected, actual) +@pytest.mark.xfail class TestVariable_withDask(TestVariable): - cls = staticmethod(Variable) + cls = staticmethod(lambda *args: Variable(*args).chunk()) def setUp(self): super(TestVariable_withDask, self).setUp() - self.isdask = True class TestIndexVariable(TestCase, VariableSubclassTestCases): @@ -1269,7 +1266,6 @@ def test_coordinate_alias(self): self.assertIsInstance(x, IndexVariable) - class TestAsCompatibleData(TestCase): def test_unchanged_types(self): types = (np.asarray, PandasIndexAdapter, indexing.LazilyIndexedArray) From 17b646542547ef7c4e14603557b9c664f9fc66bc Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 16 Jul 2017 17:07:00 +0900 Subject: [PATCH 006/113] Explicitly mark xfail flags --- xarray/tests/test_variable.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 
deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index cc2bfba4915..1dea1f71c9b 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -208,6 +208,7 @@ def test_pandas_data(self): def test_pandas_period_index(self): v = self.cls(['x'], pd.period_range(start='2000', periods=20, freq='B')) + v = v.load() # for dask-based Variable self.assertEqual(v[0], pd.Period('2000', freq='B')) assert "Period('2000-01-03', 'B')" in repr(v) @@ -463,10 +464,11 @@ def test_load(self): array = self.cls('x', np.arange(5)) orig_data = array._data copied = array.copy(deep=True) - array.load() - assert type(array._data) is type(orig_data) - assert type(copied._data) is type(orig_data) - self.assertVariableIdentical(array, copied) + if array.chunks is None: + array.load() + assert type(array._data) is type(orig_data) + assert type(copied._data) is type(orig_data) + self.assertVariableIdentical(array, copied) class TestVariable(TestCase, VariableSubclassTestCases): @@ -1179,13 +1181,29 @@ def test_count(self): self.assertVariableIdentical(expected, actual) -@pytest.mark.xfail class TestVariable_withDask(TestVariable): cls = staticmethod(lambda *args: Variable(*args).chunk()) def setUp(self): super(TestVariable_withDask, self).setUp() + @pytest.mark.xfail + def test_0d_object_array_with_list(self): + super(TestVariable_withDask, self).test_0d_object_array_with_list() + + @pytest.mark.xfail + def test_array_interface(self): + # dask array does not have `argsort` + super(TestVariable_withDask, self).test_array_interface() + + @pytest.mark.xfail + def test_copy_index(self): + super(TestVariable_withDask, self).test_copy_index() + + @pytest.mark.xfail + def test_eq_all_dtypes(self): + super(TestVariable_withDask, self).test_eq_all_dtypes() + class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From 33c51d3de084590c3fbdf76713031b0ef0eb60ad Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: 
Mon, 17 Jul 2017 01:24:37 +0900 Subject: [PATCH 007/113] orthogonal indexing for dask. --- xarray/core/indexing.py | 44 +++++++++++++++-------------------- xarray/core/variable.py | 38 +++++++----------------------- xarray/tests/test_variable.py | 19 ++++++++------- 3 files changed, 38 insertions(+), 63 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index ef8200eb451..49360c03b94 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -371,7 +371,7 @@ def shape(self): return tuple(shape) def __array__(self, dtype=None): - array = orthogonally_indexable(self.array) + array = broadcasted_indexable(self.array) return np.asarray(array[self.key], dtype=None) def __getitem__(self, key): @@ -434,7 +434,7 @@ def __setitem__(self, key, value): self.array[key] = value -def orthogonally_indexable(array): +def broadcasted_indexable(array): if isinstance(array, np.ndarray): return NumpyIndexingAdapter(array) if isinstance(array, pd.Index): @@ -445,24 +445,10 @@ def orthogonally_indexable(array): class NumpyIndexingAdapter(utils.NDArrayMixin): - """Wrap a NumPy array to use orthogonal indexing (array indexing - accesses different dimensions independently, like netCDF4-python variables) + """Wrap a NumPy array to use broadcasted indexing """ - # note: this object is somewhat similar to biggus.NumpyArrayAdapter in that - # it implements orthogonal indexing, except it casts to a numpy array, - # isn't lazy and supports writing values. def __init__(self, array): - self.array = np.asarray(array) - - def __array__(self, dtype=None): - return np.asarray(self.array, dtype=dtype) - - def _convert_key(self, key): - key = expanded_indexer(key, self.ndim) - if any(not isinstance(k, integer_types + (slice,)) for k in key): - # key would trigger fancy indexing - key = orthogonal_indexer(key, self.shape) - return key + self.array = array def _ensure_ndarray(self, value): # We always want the result of indexing to be a NumPy array. 
If it's @@ -474,29 +460,37 @@ def _ensure_ndarray(self, value): return value def __getitem__(self, key): - key = self._convert_key(key) return self._ensure_ndarray(self.array[key]) def __setitem__(self, key, value): - key = self._convert_key(key) self.array[key] = value class DaskIndexingAdapter(utils.NDArrayMixin): - """Wrap a dask array to support orthogonal indexing + """Wrap a dask array to support broadcasted-indexing. """ def __init__(self, array): self.array = array def __getitem__(self, key): - key = expanded_indexer(key, self.ndim) - if any(not isinstance(k, integer_types + (slice,)) for k in key): + """ key: tuple of Variable, slice, integer """ + # basic or orthogonal indexing + if all(isinstance(k, (integer_types, slice)) or k.squeeze().ndim <= 1 + for k in key): value = self.array for axis, subkey in reversed(list(enumerate(key))): + if hasattr(subkey, 'squeeze'): + subkey = subkey.squeeze() + if subkey.ndim == 0: # make at least 1-d array + subkey = subkey.flatten() value = value[(slice(None),) * axis + (subkey,)] + return value else: - value = self.array[key] - return value + # TODO Dask does not support nd-array indexing. + # flatten() -> .vindex[] -> reshape() should be used + # instead of `.load()` + value = np.asarray(self.array)[key] + return value class PandasIndexAdapter(utils.NDArrayMixin): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 75ab78a23ec..93efdb52ca1 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -18,7 +18,8 @@ from . 
import utils from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) -from .indexing import (PandasIndexAdapter, orthogonally_indexable) +from .indexing import (DaskIndexingAdapter, PandasIndexAdapter, + broadcasted_indexable) import xarray as xr # only for Dataset and DataArray @@ -297,7 +298,7 @@ def data(self, data): @property def _indexable_data(self): - return orthogonally_indexable(self._data) + return broadcasted_indexable(self._data) def load(self): """Manually trigger loading of this variable's data from disk or a @@ -417,7 +418,7 @@ def nonzero(self): return tuple([as_variable(nz, name=dim) for nz, dim in zip(nonzeros, self.dims)]) - def _isbool(self): + def _isbool_type(self): """ Return if the variabe is bool or not """ if isinstance(self._data, (np.ndarray, PandasIndexAdapter, pd.Index)): return self._data.dtype is np.dtype('bool') @@ -439,7 +440,7 @@ def _broadcast_indexes_advanced(self, key): "cannot be used for indexing.") else: raise e - if variable._isbool(): # boolean indexing case + if variable._isbool_type(): # boolean indexing case variables.extend(list(variable.nonzero())) else: variables.append(variable) @@ -448,21 +449,6 @@ def _broadcast_indexes_advanced(self, key): key = tuple(variable.data for variable in variables) return dims, key - def _ensure_array(self, value): - """ For np.ndarray-based-Variable, we always want the result of - indexing to be a NumPy array. If it's not, then it really should be a - 0d array. 
Doing the coercion here instead of inside - variable.as_compatible_data makes it less error prone.""" - if isinstance(self._data, np.ndarray): - if not isinstance(value, np.ndarray): - value = utils.to_0d_array(value) - elif isinstance(self._data, dask_array_type): - print(value) - if not isinstance(value, (dask_array_type, dask_array_type)): - value = utils.to_0d_array(value) - - return value - def __getitem__(self, key): """Return a new Array object whose contents are consistent with getting the provided key from the underlying data. @@ -473,13 +459,7 @@ def __getitem__(self, key): This method will replace __getitem__ after we make sure its stability. """ dims, index_tuple = self._broadcast_indexes(key) - try: - values = self._ensure_array(self._data[index_tuple]) - except NotImplementedError: - # TODO temporal implementation. - # Need to wait for dask's nd index support? - values = self._ensure_array(self.load()._data[index_tuple]) - + values = self._indexable_data[index_tuple] if hasattr(values, 'ndim'): assert values.ndim == len(dims), (values.ndim, len(dims)) else: @@ -493,15 +473,15 @@ def __setitem__(self, key, value): See __getitem__ for more details. """ - key = self._item_key_to_tuple(key) + dims, index_tuple = self._broadcast_indexes(key) if isinstance(self._data, dask_array_type): raise TypeError("this variable's data is stored in a dask array, " 'which does not support item assignment. 
To ' 'assign to this variable, you must first load it ' 'into memory explicitly using the .load_data() ' 'method or accessing its .values attribute.') - data = orthogonally_indexable(self._data) - data[key] = value + data = broadcasted_indexable(self._data) + data[index_tuple] = value @property def attrs(self): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 1dea1f71c9b..54adf210519 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -750,46 +750,47 @@ def test_getitem_advanced(self): ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) v_new = v[ind] self.assertTrue(v_new.dims == ('a', 'b', 'y')) - self.assertArrayEqual(v_new, v._data[([0, 1, 1], [1, 1, 0]), :]) + self.assertArrayEqual(v_new, v.load()._data[([0, 1, 1], [1, 1, 0]), :]) ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) v_new = v[dict(y=ind)] self.assertTrue(v_new.dims == ('x', 'a', 'b')) - self.assertArrayEqual(v_new, v._data[:, ([0, 1, 2], [2, 1, 0])]) + self.assertArrayEqual(v_new, v.load()._data[:, ([0, 1, 2], [2, 1, 0])]) # with mixed arguments ind = Variable(['a'], [0, 1]) v_new = v[dict(x=[0, 1], y=ind)] self.assertTrue(v_new.dims == ('x', 'a')) - self.assertArrayEqual(v_new, v._data[[0, 1]][:, [0, 1]]) + self.assertArrayEqual(v_new, v.load()._data[[0, 1]][:, [0, 1]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=[1, 0], y=ind)] self.assertTrue(v_new.dims == ('x', 'a', 'b')) - self.assertArrayEqual(v_new, v._data[[1, 0]][:, ind]) + self.assertArrayEqual(v_new, v.load()._data[[1, 0]][:, ind]) # with integer ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=0, y=ind)] self.assertTrue(v_new.dims == ('a', 'b')) - self.assertArrayEqual(v_new[0], v._data[0][[0, 0]]) - self.assertArrayEqual(v_new[1], v._data[0][[1, 1]]) + self.assertArrayEqual(v_new[0], v.load()._data[0][[0, 0]]) + self.assertArrayEqual(v_new[1], v.load()._data[0][[1, 1]]) # with slice ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = 
v[dict(x=slice(None), y=ind)] self.assertTrue(v_new.dims == ('x', 'a', 'b')) - self.assertArrayEqual(v_new, v._data[:, [[0, 0], [1, 1]]]) + self.assertArrayEqual(v_new, v.load()._data[:, [[0, 0], [1, 1]]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=ind, y=slice(None))] self.assertTrue(v_new.dims == ('a', 'b', 'y')) - self.assertArrayEqual(v_new, v._data[[[0, 0], [1, 1]], :]) + self.assertArrayEqual(v_new, v.load()._data[[[0, 0], [1, 1]], :]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=ind, y=slice(None, 1))] self.assertTrue(v_new.dims == ('a', 'b', 'y')) - self.assertArrayEqual(v_new, v._data[[[0, 0], [1, 1]], slice(None, 1)]) + self.assertArrayEqual(v_new, + v.load()._data[[[0, 0], [1, 1]], slice(None, 1)]) def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) From 03a336fa2bc43cb3af689a92227174be5dc74aff Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Jul 2017 15:48:18 -0700 Subject: [PATCH 008/113] Refactor DaskArrayAdapter --- xarray/core/indexing.py | 68 ++++++++++++++++++++++++++--------- xarray/tests/test_indexing.py | 24 +++++++++++++ 2 files changed, 75 insertions(+), 17 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 49360c03b94..75eb0607936 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -72,6 +72,41 @@ def _expand_slice(slice_, size): return np.arange(*slice_.indices(size)) +def maybe_convert_to_slice(indexer, size): + """Convert an indexer into an equivalent slice object, if possible. + + Arguments + --------- + indexer : int, slice or np.ndarray + If a numpy array, must have integer dtype. + size : integer + Integer size of the dimension to be indexed. 
+ """ + if indexer.ndim != 1 or not isinstance(indexer, np.ndarray): + return indexer + + if indexer.size == 0: + return slice(0, 0) + + if indexer.min() < -size or indexer.max() >= size: + raise IndexError( + 'indexer has elements out of bounds for axis of size {}: {}' + .format(size, indexer)) + + indexer = np.where(indexer < 0, indexer + size, indexer) + if indexer.size == 1: + i = int(indexer[0]) + return slice(i, i + 1) + + start = int(indexer[0]) + step = int(indexer[1] - start) + stop = start + step * indexer.size + guess = slice(start, stop, step) + if np.array_equal(_expand_slice(guess, size), indexer): + return guess + return indexer + + def orthogonal_indexer(key, shape): """Given a key for orthogonal array indexing, returns an equivalent key suitable for indexing a numpy.ndarray with fancy indexing. @@ -473,24 +508,23 @@ def __init__(self, array): self.array = array def __getitem__(self, key): - """ key: tuple of Variable, slice, integer """ - # basic or orthogonal indexing - if all(isinstance(k, (integer_types, slice)) or k.squeeze().ndim <= 1 - for k in key): - value = self.array - for axis, subkey in reversed(list(enumerate(key))): - if hasattr(subkey, 'squeeze'): - subkey = subkey.squeeze() - if subkey.ndim == 0: # make at least 1-d array - subkey = subkey.flatten() - value = value[(slice(None),) * axis + (subkey,)] - return value + """ key: tuple of ndarray, slice, integer """ + if all(isinstance(k, integer_types + (slice,)) for k in key): + # basic indexing + return self.array[key] + elif all(k.shape == (1,) * (i - 1) + (max(k.shape),) + (1,) * (i - 1) + for i, k in enumerate(key) + if isinstance(k, np.ndarray)): + # orthogonal indexing + # dask only supports one list in an indexer, so convert to slice if + # possible + key = tuple(maybe_convert_to_slice(np.ravel(k), size) + for k, size in zip(key, self.shape)) + return self.array[key] + # TODO: handle point-wise indexing with vindex else: - # TODO Dask does not support nd-array indexing. 
- # flatten() -> .vindex[] -> reshape() should be used - # instead of `.load()` - value = np.asarray(self.array)[key] - return value + raise IndexError( + 'dask does not support fancy indexing with key: {}'.format(key)) class PandasIndexAdapter(utils.NDArrayMixin): diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 3866e0511a5..6d478a68b5f 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -29,6 +29,30 @@ def test_expanded_indexer(self): with self.assertRaisesRegexp(IndexError, 'too many indices'): indexing.expanded_indexer(I[1, 2, 3], 2) + def test_maybe_convert_to_slice(self): + + cases = [ + (1,), + (1, 1), + (1, 2), + (10,), + (0, 10), + (5, 10), + (5, 8), + (None, 5), + (None, -3), + (0, 10, 2), + (10, None, -1), + (7, 3, -2), + ] + for case in cases: + slice_obj = slice(*case) + base_array = np.arange(*slice_obj.indices(10)) + for array in [base_array, base_array - 10]: + actual = indexing.maybe_convert_to_slice(array, 10) + self.assertArrayEqual(np.arange(10)[actual], + np.arange(10)[slice_obj]) + def test_orthogonal_indexer(self): x = np.random.randn(10, 11, 12, 13, 14) y = np.arange(5) From 866de9174cef853d60f8639857c954eed7baec29 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 17 Jul 2017 16:35:49 +0900 Subject: [PATCH 009/113] Added MissingDimensionsError. 
Improve DaskIndexingAdapter, Variable.__getitem__ --- xarray/core/indexing.py | 31 ++++++++++++----- xarray/core/variable.py | 59 +++++++++++++++----------------- xarray/tests/test_variable.py | 63 +++++++++++++++++++++++++++++++---- 3 files changed, 106 insertions(+), 47 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 75eb0607936..f5bf17d0a52 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -496,7 +496,6 @@ def _ensure_ndarray(self, value): def __getitem__(self, key): return self._ensure_ndarray(self.array[key]) - def __setitem__(self, key, value): self.array[key] = value @@ -505,26 +504,42 @@ class DaskIndexingAdapter(utils.NDArrayMixin): """Wrap a dask array to support broadcasted-indexing. """ def __init__(self, array): + """ This adapter is usually called in Variable.__getitem__ with + array=Variable._broadcast_indexes + """ self.array = array - def __getitem__(self, key): + def _broadcast_indexes(self, key): """ key: tuple of ndarray, slice, integer """ if all(isinstance(k, integer_types + (slice,)) for k in key): # basic indexing - return self.array[key] - elif all(k.shape == (1,) * (i - 1) + (max(k.shape),) + (1,) * (i - 1) - for i, k in enumerate(key) - if isinstance(k, np.ndarray)): + return key + elif all(k.shape == + (1,) * i + (max(k.shape),) + (1,) * (k.ndim - i - 1) + for i, k in enumerate(key) if hasattr(k, 'shape')): # orthogonal indexing # dask only supports one list in an indexer, so convert to slice if # possible key = tuple(maybe_convert_to_slice(np.ravel(k), size) for k, size in zip(key, self.shape)) - return self.array[key] + return key # TODO: handle point-wise indexing with vindex else: raise IndexError( - 'dask does not support fancy indexing with key: {}'.format(key)) + 'dask does not support fancy indexing with key: {}'.format(key)) + + def __getitem__(self, key): + key = self._broadcast_indexes(key) + return self.array[key] + + def __setitem__(self, key, value): + key = 
self._broadcast_indexes(key) + raise TypeError("this variable's data is stored in a dask array, " + 'which does not support item assignment. To ' + 'assign to this variable, you must first load it ' + 'into memory explicitly using the .load_data() ' + 'method or accessing its .values attribute.') + self.array[key] = value class PandasIndexAdapter(utils.NDArrayMixin): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 93efdb52ca1..1c478aa8aca 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -18,8 +18,7 @@ from . import utils from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) -from .indexing import (DaskIndexingAdapter, PandasIndexAdapter, - broadcasted_indexable) +from .indexing import (PandasIndexAdapter, broadcasted_indexable) import xarray as xr # only for Dataset and DataArray @@ -29,6 +28,13 @@ pass +class MissingDimensionsError(ValueError): + """Error class used when we can't safely guess a dimension name. + """ + # inherits from ValueError for backward compatibility + # TODO: move this to an xarray.exceptions module? + + def as_variable(obj, name=None): """Convert an object into a Variable. @@ -87,7 +93,7 @@ def as_variable(obj, name=None): elif name is not None: data = as_compatible_data(obj) if data.ndim != 1: - raise ValueError( + raise MissingDimensionsError( 'cannot set variable %r with %r-dimensional data ' 'without explicit dimension names. Pass a tuple of ' '(dims, data) instead.' % (name, data.ndim)) @@ -99,7 +105,7 @@ def as_variable(obj, name=None): if name is not None and name in obj.dims: # convert the Variable into an Index if obj.ndim != 1: - raise ValueError( + raise MissingDimensionsError( '%r has more than 1-dimension and the same name as one of its ' 'dimensions %r. xarray disallows such variables because they ' 'conflict with the coordinates used to label dimensions.' 
@@ -408,23 +414,12 @@ def _broadcast_indexes_basic(self, key): def nonzero(self): """ Equivalent numpy's nonzero but returns a tuple of Varibles. """ - if isinstance(self._data, (np.ndarray, pd.Index, PandasIndexAdapter)): - nonzeros = np.nonzero(self._data) - elif isinstance(self._data, dask_array_type): - # TODO we should replace dask's native nonzero - # after https://github.com/dask/dask/issues/1076 is implemented. - nonzeros = np.nonzero(self.load()._data) - + # TODO we should replace dask's native nonzero + # after https://github.com/dask/dask/issues/1076 is implemented. + nonzeros = np.nonzero(self.data) return tuple([as_variable(nz, name=dim) for nz, dim in zip(nonzeros, self.dims)]) - def _isbool_type(self): - """ Return if the variabe is bool or not """ - if isinstance(self._data, (np.ndarray, PandasIndexAdapter, pd.Index)): - return self._data.dtype is np.dtype('bool') - elif isinstance(self._data, dask_array_type): - raise NotImplementedError - def _broadcast_indexes_advanced(self, key): variables = [] @@ -432,15 +427,16 @@ def _broadcast_indexes_advanced(self, key): if isinstance(value, slice): value = np.arange(self.sizes[dim])[value] - try: # TODO we need our own Exception. + try: variable = as_variable(value, name=dim) - except ValueError as e: - if "cannot set variable" in str(e): - raise IndexError("Unlabelled multi-dimensional array " - "cannot be used for indexing.") - else: - raise e - if variable._isbool_type(): # boolean indexing case + except MissingDimensionsError: # change to better exception + raise IndexError("Unlabelled multi-dimensional array " + "cannot be used for indexing.") + + if variable.dtype.kind == 'b': # boolean indexing case + if variable.ndim > 1: + raise IndexError("{}-dimensional boolean indexing is " + "not supported. ".format(variable.ndim)) variables.extend(list(variable.nonzero())) else: variables.append(variable) @@ -474,14 +470,11 @@ def __setitem__(self, key, value): See __getitem__ for more details. 
""" dims, index_tuple = self._broadcast_indexes(key) - if isinstance(self._data, dask_array_type): - raise TypeError("this variable's data is stored in a dask array, " - 'which does not support item assignment. To ' - 'assign to this variable, you must first load it ' - 'into memory explicitly using the .load_data() ' - 'method or accessing its .values attribute.') data = broadcasted_indexable(self._data) - data[index_tuple] = value + if isinstance(value, Variable): + data[index_tuple] = value.set_dims(dims) + else: + data[index_tuple] = value @property def attrs(self): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 54adf210519..78fec3c383a 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -747,6 +747,26 @@ def test_getitem_advanced(self): self.assertTrue(v_new.dims == ('x', 'y')) self.assertArrayEqual(v_new, v._data[[0, 1]]) + # with mixed arguments + ind = Variable(['a'], [0, 1]) + v_new = v[dict(x=[0, 1], y=ind)] + self.assertTrue(v_new.dims == ('x', 'a')) + self.assertArrayEqual(v_new, v.load()._data[[0, 1]][:, [0, 1]]) + + # boolean indexing + v_new = v[dict(x=[True, False], y=[False, True])] + self.assertTrue(v_new.dims == ('x', 'y')) + self.assertArrayEqual(v_new, v.load()._data[0][1]) + + ind = Variable(['a'], [True, False]) + v_new = v[dict(y=ind)] + self.assertTrue(v_new.dims == ('x', 'a')) + self.assertArrayEqual(v_new, v.load()._data[:, 0:1]) + + def test_getitem_fancy(self): + # Note This fancy getitem is not supported by dask-based Variable. 
+ v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) v_new = v[ind] self.assertTrue(v_new.dims == ('a', 'b', 'y')) @@ -757,12 +777,6 @@ def test_getitem_advanced(self): self.assertTrue(v_new.dims == ('x', 'a', 'b')) self.assertArrayEqual(v_new, v.load()._data[:, ([0, 1, 2], [2, 1, 0])]) - # with mixed arguments - ind = Variable(['a'], [0, 1]) - v_new = v[dict(x=[0, 1], y=ind)] - self.assertTrue(v_new.dims == ('x', 'a')) - self.assertArrayEqual(v_new, v.load()._data[[0, 1]][:, [0, 1]]) - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=[1, 0], y=ind)] self.assertTrue(v_new.dims == ('x', 'a', 'b')) @@ -803,6 +817,35 @@ def test_getitem_error(self): ind_y = Variable(['a'], [0]) v[(ind_x, ind_y)] + with self.assertRaisesRegexp(IndexError, "2-dimensional boolean"): + ind = Variable(['a', 'b'], [[True, False], [False, True]]) + v[dict(x=ind)] + + def test_setitem(self): + v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[0, 1] = 1 + self.assertTrue(v[0, 1] == 1) + + v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[dict(x=[0, 1])] = 1 + self.assertArrayEqual(v[[0, 1]], np.ones_like(v[[0, 1]])) + + # boolean indexing + v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[dict(x=[True, False])] = 1 + + self.assertArrayEqual(v[0], np.ones_like(v[0])) + v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[dict(x=[True, False], y=[False, True])] = 1 + self.assertTrue(v[0, 1] == 1) + + # dimension broadcast + v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + ind = Variable(['a'], [0, 1]) + v[dict(x=ind)] = Variable(['a', 'y'], np.ones((2, 3), dtype=int) * 10) + self.assertArrayEqual(v[0], np.ones_like(v[0]) * 10) + self.assertArrayEqual(v[1], np.ones_like(v[0]) * 10) + def test_isel(self): v = Variable(['time', 'x'], self.d) self.assertVariableIdentical(v.isel(time=slice(None)), v) @@ -1205,6 +1248,14 @@ def test_copy_index(self): def test_eq_all_dtypes(self): super(TestVariable_withDask, 
self).test_eq_all_dtypes() + @pytest.mark.xfail + def test_getitem_fancy(self): + super(TestVariable_withDask, self).test_getitem_fancy() + + @pytest.mark.xfail + def test_setitem(self): + super(TestVariable_withDask, self).test_setitem() + class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From 08e74448800f0bf30abf280c6a79d90b896b8fc0 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 17 Jul 2017 16:41:03 +0900 Subject: [PATCH 010/113] use `np.arange(*slice.indices(size))` rather than `np.arange(size)[slice]`. --- xarray/core/variable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 1c478aa8aca..dc3cdef2ee9 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -425,7 +425,7 @@ def _broadcast_indexes_advanced(self, key): for dim, value in zip(self.dims, key): if isinstance(value, slice): - value = np.arange(self.sizes[dim])[value] + value = np.arange(*value.indices(self.sizes[dim])) try: variable = as_variable(value, name=dim) From 7b3326970286195cb25fc634a5a0a3b90b3c9a5d Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 20 Jul 2017 19:12:12 +0900 Subject: [PATCH 011/113] Add orthogonalize_indexers --- xarray/core/indexing.py | 79 ++++++++++++++++++++++++++++------- xarray/tests/test_indexing.py | 48 +++++++++++++++++++++ 2 files changed, 112 insertions(+), 15 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f5bf17d0a52..5e9c6ddf3c7 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -42,6 +42,7 @@ def expanded_indexer(key, ndim): return tuple(new_key) +# TODO should be deprecated def canonicalize_indexer(key, ndim): """Given an indexer for orthogonal array indexing, return an indexer that is a tuple composed entirely of slices, integer ndarrays and native python @@ -107,6 +108,7 @@ def maybe_convert_to_slice(indexer, size): return indexer +# TODO should be deprecated 
def orthogonal_indexer(key, shape): """Given a key for orthogonal array indexing, returns an equivalent key suitable for indexing a numpy.ndarray with fancy indexing. @@ -469,6 +471,63 @@ def __setitem__(self, key, value): self.array[key] = value +def orthogonalize_indexers(key, shape): + """ + Convert broadcasted indexers to orthogonal indexers. + There is no valid mapping, raises IndexError. + + key is usually generated by Variable._broadcast_indexes. + + key: tuple of np.ndarray, ndarray, slice, integer + shape: shape of array + """ + if all(isinstance(k, integer_types + (slice,)) for k in key): + # basic indexing + return key + + # Here, we need to distinguish integer and 1-element array + # (key[0].shape, key[1].shape, key[2].shape) = + # ((4, 1), (1, 1), (1, 3)) -> result_shape = [4, 3] + # ((4, 1, 1), (1, 1, 1), (1, 1, 3)) -> result_shape = [4, 1, 3] + result_shape = [1] * key[0].ndim + for k in key: + if k.size != 1: + try: + result_shape[k.shape.index(k.size)] = k.size + except ValueError: + raise IndexError( + "Indexer cannot be orthogonalized: {}".format(k)) + # sanity check + dims = [k.shape.index(k.size) for k in key] + if dims[0] != 0: + raise IndexError("Indexer cannot be orthogonalized: {}".format(key)) + while len(dims) > 0: # remove 0 + try: + dims.remove(0) + except ValueError: + break + for i in range(1, len(dims)): + if dims[i-1] > dims[i]: # dims should be increasing + raise IndexError("Indexer cannot be orthogonalized:{}".format(key)) + + cursor = 0 # index moving in result_shape + key = list(key) + for i, k in enumerate(key): + if (k.size == 1 and + (len(result_shape) <= cursor or result_shape[cursor] != 1)): + # integer + key[i] = k.item() + else: + key[i] = np.ravel(k) + cursor += 1 + key = tuple(maybe_convert_to_slice(k, size) if hasattr(k, 'dim') else k + for k, size in zip(key, shape)) + return key + + #except ValueError: + # raise IndexError("Indexer cannot be orthogonalized: {}".format(key)) + + def broadcasted_indexable(array): if 
isinstance(array, np.ndarray): return NumpyIndexingAdapter(array) @@ -496,6 +555,7 @@ def _ensure_ndarray(self, value): def __getitem__(self, key): return self._ensure_ndarray(self.array[key]) + def __setitem__(self, key, value): self.array[key] = value @@ -510,21 +570,10 @@ def __init__(self, array): self.array = array def _broadcast_indexes(self, key): - """ key: tuple of ndarray, slice, integer """ - if all(isinstance(k, integer_types + (slice,)) for k in key): - # basic indexing - return key - elif all(k.shape == - (1,) * i + (max(k.shape),) + (1,) * (k.ndim - i - 1) - for i, k in enumerate(key) if hasattr(k, 'shape')): - # orthogonal indexing - # dask only supports one list in an indexer, so convert to slice if - # possible - key = tuple(maybe_convert_to_slice(np.ravel(k), size) - for k, size in zip(key, self.shape)) - return key - # TODO: handle point-wise indexing with vindex - else: + try: + return orthogonalize_indexers(key, self.shape) + # TODO: handle point-wise indexing with vindex + except IndexError: raise IndexError( 'dask does not support fancy indexing with key: {}'.format(key)) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 6d478a68b5f..dc66fea985e 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -3,6 +3,7 @@ from __future__ import print_function import numpy as np import pandas as pd +import pytest from xarray import Dataset, DataArray, Variable from xarray.core import indexing @@ -203,6 +204,7 @@ def test_slice_slice(self): actual = x[new_slice] self.assertArrayEqual(expected, actual) + @pytest.mark.xfail def test_lazily_indexed_array(self): x = indexing.NumpyIndexingAdapter(np.random.rand(10, 20, 30)) lazy = indexing.LazilyIndexedArray(x) @@ -278,3 +280,49 @@ def test_index_scalar(self): # regression test for GH1374 x = indexing.MemoryCachedArray(np.array(['foo', 'bar'])) assert np.array(x[0][()]) == 'foo' + + +class Test_orthogonalize_indexers(TestCase): + def 
assert1dIndexEqual(self, x, y, size): + """ Compare 1d vector, slice, array """ + def vectorize(array): + if isinstance(array, slice): # slice + return np.arange(*array.indices(size)) + if hasattr(array, 'dtype') and array.dtype.kind == 'b': + # boolean array + return np.arange(len(array))[array] + return np.arange(size)[array] + + self.assertArrayEqual(vectorize(x), vectorize(y)) + self.assertArrayEqual(vectorize(x).shape, vectorize(y).shape) + + def test(self): + original = np.random.rand(10, 20, 30) + v = Variable(('i', 'j', 'k'), original) + I = ReturnItem() + # test broadcasted indexers + indexers = [I[:], 0, -2, I[:3], [4, 1, 2, 3], [0], np.arange(10) < 5] + for i in indexers: + for j in indexers: + for k in indexers: + dims, indexer = v._broadcast_indexes((i, j, k)) + orthogonalized = indexing.orthogonalize_indexers( + indexer, v.shape) + dim_new, indexer_new = v._broadcast_indexes(orthogonalized) + + self.assertArrayEqual(original[indexer], + original[indexer_new]) + orthogonalized_new = indexing.orthogonalize_indexers( + indexer_new, v.shape) + self.assertArrayEqual(orthogonalized[0], + orthogonalized_new[0]) + self.assertArrayEqual(orthogonalized[0], + orthogonalized_new[0]) + + def test_error(self): + with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): + indexing.orthogonalize_indexers((np.ones((2, 2)), np.ones((2, 1))), + shape=(3, 2)) + with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): + indexing.orthogonalize_indexers((np.ones((1, 2)), np.ones((2, 1))), + shape=(3, 2)) From 50ea56e25ff50d6217b9d1e90afab72cd52f4cfa Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 20 Jul 2017 21:43:27 +0900 Subject: [PATCH 012/113] A bug fix. 
--- xarray/core/indexing.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 5e9c6ddf3c7..edd36cf055a 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -520,13 +520,11 @@ def orthogonalize_indexers(key, shape): else: key[i] = np.ravel(k) cursor += 1 - key = tuple(maybe_convert_to_slice(k, size) if hasattr(k, 'dim') else k + key = tuple(k if isinstance(k, integer_types) + else maybe_convert_to_slice(k, size) for k, size in zip(key, shape)) return key - #except ValueError: - # raise IndexError("Indexer cannot be orthogonalized: {}".format(key)) - def broadcasted_indexable(array): if isinstance(array, np.ndarray): From bac0089f6afb14d00ee5a1a5bd4448a2e1ab159e Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 20 Jul 2017 21:57:42 +0900 Subject: [PATCH 013/113] Working with LazilyIndexedArray --- xarray/core/indexing.py | 11 ++++ xarray/tests/test_indexing.py | 105 +++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index edd36cf055a..4f767b4ee26 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -526,6 +526,17 @@ def orthogonalize_indexers(key, shape): return key +class BroadcastedIndexingAdapter(utils.NDArrayMixin): + """ An array wrapper for orthogonally indexed arrays, such as netCDF. 
""" + def __init__(self, array): + self.array = array + + def __getitem__(self, key): + key = expanded_indexer(key, self.ndim) + key = orthogonalize_indexers(key, self.shape) + return self.array[key] + + def broadcasted_indexable(array): if isinstance(array, np.ndarray): return NumpyIndexingAdapter(array) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index dc66fea985e..da374af8331 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -6,10 +6,88 @@ import pytest from xarray import Dataset, DataArray, Variable -from xarray.core import indexing +from xarray.core import indexing, utils from . import TestCase, ReturnItem +class NumpyOrthogonalIndexingAdapter(utils.NDArrayMixin): + """Wrap a NumPy array to use orthogonal indexing (array indexing + accesses different dimensions independently, like netCDF4-python variables) + """ + # note: this object is somewhat similar to biggus.NumpyArrayAdapter in that + # it implements orthogonal indexing, except it casts to a numpy array, + # isn't lazy and supports writing values. + def __init__(self, array): + self.array = np.asarray(array) + + def __array__(self, dtype=None): + return np.asarray(self.array, dtype=dtype) + + def _convert_key(self, key): + key = indexing.expanded_indexer(key, self.ndim) + if any(not isinstance(k, indexing.integer_types + (slice,)) + for k in key): + # key would trigger fancy indexing + key = indexing.orthogonal_indexer(key, self.shape) + return key + + def _ensure_ndarray(self, value): + # We always want the result of indexing to be a NumPy array. If it's + # not, then it really should be a 0d array. Doing the coercion here + # instead of inside variable.as_compatible_data makes it less error + # prone. 
+ if not isinstance(value, np.ndarray): + value = utils.to_0d_array(value) + return value + + def __getitem__(self, key): + key = self._convert_key(key) + return type(self)(self._ensure_ndarray(self.array[key])) + + def __setitem__(self, key, value): + key = self._convert_key(key) + self.array[key] = value + + +class TestNumpyOrthogonalIndexingAdapter(TestCase): + def test_basic(self): + def maybe_boolean_array(array, size): + """ Map boolean array to size 'size' by appendin False in its tail + """ + if hasattr(array, 'dtype') and array.dtype.kind == 'b': + array_new = np.ndarray(size, dtype='?') + array_new[:array.size] = array + array_new[array.size:] = False + return array_new + return array + + original = np.random.rand(10, 20, 30) + orthogonal = NumpyOrthogonalIndexingAdapter(original) + I = ReturnItem() + # test broadcasted indexers + indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], [0], np.arange(10) < 5] + for i in indexers: + for j in indexers: + for k in indexers: + actual = orthogonal[i, j, k] + j = maybe_boolean_array(j, 20) + k = maybe_boolean_array(k, 30) + if isinstance(i, int): + if isinstance(j, int): + expected = original[i][j][k] + else: + expected = original[i][j][:, k] + else: + if isinstance(j, int): + expected = original[i][:, j][:, k] + else: + expected = original[i][:, j][:, :, k] + self.assertArrayEqual(actual, expected) + # indivisual testing + self.assertTrue(orthogonal[np.array([0]), :, :].shape == (1, 20, 30)) + self.assertArrayEqual(orthogonal[[0], :, :], original[[0], :, :]) + + class TestIndexers(TestCase): def set_to_zero(self, x, i): x = x.copy() @@ -204,9 +282,8 @@ def test_slice_slice(self): actual = x[new_slice] self.assertArrayEqual(expected, actual) - @pytest.mark.xfail def test_lazily_indexed_array(self): - x = indexing.NumpyIndexingAdapter(np.random.rand(10, 20, 30)) + x = NumpyOrthogonalIndexingAdapter(np.random.rand(10, 20, 30)) lazy = indexing.LazilyIndexedArray(x) I = ReturnItem() # test orthogonally applied indexers @@ 
-326,3 +403,25 @@ def test_error(self): with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): indexing.orthogonalize_indexers((np.ones((1, 2)), np.ones((2, 1))), shape=(3, 2)) + + +class TestBroadcastedIndexingAdapter(TestCase): + def test_basic(self): + original = np.random.rand(10, 20, 30) + v = Variable(('i', 'j', 'k'), original) + orthogonal = NumpyOrthogonalIndexingAdapter(original) + wrapped = indexing.BroadcastedIndexingAdapter(orthogonal) + I = ReturnItem() + # test broadcasted indexers + indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], [0], np.arange(10) < 5] + for i in indexers: + for j in indexers: + for k in indexers: + actual_ortho = orthogonal[i, j, k] + dims, indexer = v._broadcast_indexes((i, j, k)) + expected = original[indexer] + actual = wrapped[indexer] + self.assertEqual(expected.shape, actual_ortho.shape) + self.assertArrayEqual(expected, actual_ortho) + self.assertEqual(expected.shape, actual.shape) + self.assertArrayEqual(expected, actual) From 1206c285dfa5386e1861d9a26730c6fdbf222fa7 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 20 Jul 2017 22:29:05 +0900 Subject: [PATCH 014/113] Fix in LazilyIndexedArray. 
--- xarray/core/indexing.py | 4 +++- xarray/tests/test_indexing.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 4f767b4ee26..10411c7afea 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -388,7 +388,7 @@ def __init__(self, array, key=None): self.key = key def _updated_key(self, new_key): - new_key = iter(canonicalize_indexer(new_key, self.ndim)) + new_key = iter(orthogonalize_indexers(new_key, self.shape)) key = [] for size, k in zip(self.array.shape, self.key): if isinstance(k, integer_types): @@ -481,6 +481,8 @@ def orthogonalize_indexers(key, shape): key: tuple of np.ndarray, ndarray, slice, integer shape: shape of array """ + key = expanded_indexer(key, len(shape)) + if all(isinstance(k, integer_types + (slice,)) for k in key): # basic indexing return key diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index da374af8331..ed97fb1e73d 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -284,25 +284,27 @@ def test_slice_slice(self): def test_lazily_indexed_array(self): x = NumpyOrthogonalIndexingAdapter(np.random.rand(10, 20, 30)) + v = Variable(['i', 'j', 'k'], x.array) lazy = indexing.LazilyIndexedArray(x) + v_lazy = Variable(['i', 'j', 'k'], lazy) I = ReturnItem() # test orthogonally applied indexers indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], np.arange(10) < 5] for i in indexers: for j in indexers: for k in indexers: - expected = np.asarray(x[i, j, k]) - for actual in [lazy[i, j, k], - lazy[:, j, k][i], - lazy[:, :, k][:, j][i]]: + expected = np.asarray(v[i, j, k]) + for actual in [v_lazy[i, j, k], + v_lazy[:, j, k][i], + v_lazy[:, :, k][:, j][i]]: self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) # test sequentially applied indexers indexers = [(3, 2), (I[:], 0), (I[:2], -1), (I[:4], [0]), ([4, 5], 0), ([0, 1, 2], [0, 1]), ([0, 3, 5], I[:2])] for i, j 
in indexers: - expected = np.asarray(x[i][j]) - actual = lazy[i][j] + expected = np.asarray(v[i][j]) + actual = v_lazy[i][j] self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) From c2747befacc9d159f075014fd258f8b1911b278b Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 20 Jul 2017 22:39:18 +0900 Subject: [PATCH 015/113] add @requires_dask in test_variable --- xarray/tests/test_variable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 78fec3c383a..72aa95bbb05 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1224,7 +1224,7 @@ def test_count(self): actual = Variable(['x', 'y'], [[1, 0, np.nan], [1, 1, 1]]).count('y') self.assertVariableIdentical(expected, actual) - +@requires_dask class TestVariable_withDask(TestVariable): cls = staticmethod(lambda *args: Variable(*args).chunk()) From 0671f39598a967756e03eeebd7b7dbe02b5a061e Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 21 Jul 2017 21:14:47 +0900 Subject: [PATCH 016/113] rename orthogonalize_indexers -> unbroadcast_indexers --- xarray/core/indexing.py | 14 +++++++------- xarray/tests/test_indexing.py | 15 +++++++-------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 10411c7afea..6b8d5842838 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -388,7 +388,7 @@ def __init__(self, array, key=None): self.key = key def _updated_key(self, new_key): - new_key = iter(orthogonalize_indexers(new_key, self.shape)) + new_key = iter(unbroadcast_indexers(new_key, self.shape)) key = [] for size, k in zip(self.array.shape, self.key): if isinstance(k, integer_types): @@ -471,14 +471,14 @@ def __setitem__(self, key, value): self.array[key] = value -def orthogonalize_indexers(key, shape): +def unbroadcast_indexers(key, shape): """ Convert broadcasted indexers to orthogonal 
indexers. - There is no valid mapping, raises IndexError. + If there is no valid mapping, raises IndexError. key is usually generated by Variable._broadcast_indexes. - key: tuple of np.ndarray, ndarray, slice, integer + key: tuple of np.ndarray, slice, integer shape: shape of array """ key = expanded_indexer(key, len(shape)) @@ -488,7 +488,7 @@ def orthogonalize_indexers(key, shape): return key # Here, we need to distinguish integer and 1-element array - # (key[0].shape, key[1].shape, key[2].shape) = + # E.G. If (key[0].shape, key[1].shape, key[2].shape) is # ((4, 1), (1, 1), (1, 3)) -> result_shape = [4, 3] # ((4, 1, 1), (1, 1, 1), (1, 1, 3)) -> result_shape = [4, 1, 3] result_shape = [1] * key[0].ndim @@ -535,7 +535,7 @@ def __init__(self, array): def __getitem__(self, key): key = expanded_indexer(key, self.ndim) - key = orthogonalize_indexers(key, self.shape) + key = unbroadcast_indexers(key, self.shape) return self.array[key] @@ -582,7 +582,7 @@ def __init__(self, array): def _broadcast_indexes(self, key): try: - return orthogonalize_indexers(key, self.shape) + return unbroadcast_indexers(key, self.shape) # TODO: handle point-wise indexing with vindex except IndexError: raise IndexError( diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index ed97fb1e73d..73238d9f4ca 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -3,7 +3,6 @@ from __future__ import print_function import numpy as np import pandas as pd -import pytest from xarray import Dataset, DataArray, Variable from xarray.core import indexing, utils @@ -361,7 +360,7 @@ def test_index_scalar(self): assert np.array(x[0][()]) == 'foo' -class Test_orthogonalize_indexers(TestCase): +class Test_unbroadcast_indexers(TestCase): def assert1dIndexEqual(self, x, y, size): """ Compare 1d vector, slice, array """ def vectorize(array): @@ -385,13 +384,13 @@ def test(self): for j in indexers: for k in indexers: dims, indexer = v._broadcast_indexes((i, j, k)) - 
orthogonalized = indexing.orthogonalize_indexers( + orthogonalized = indexing.unbroadcast_indexers( indexer, v.shape) dim_new, indexer_new = v._broadcast_indexes(orthogonalized) self.assertArrayEqual(original[indexer], original[indexer_new]) - orthogonalized_new = indexing.orthogonalize_indexers( + orthogonalized_new = indexing.unbroadcast_indexers( indexer_new, v.shape) self.assertArrayEqual(orthogonalized[0], orthogonalized_new[0]) @@ -400,11 +399,11 @@ def test(self): def test_error(self): with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): - indexing.orthogonalize_indexers((np.ones((2, 2)), np.ones((2, 1))), - shape=(3, 2)) + indexing.unbroadcast_indexers((np.ones((2, 2)), np.ones((2, 1))), + shape=(3, 2)) with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): - indexing.orthogonalize_indexers((np.ones((1, 2)), np.ones((2, 1))), - shape=(3, 2)) + indexing.unbroadcast_indexers((np.ones((1, 2)), np.ones((2, 1))), + shape=(3, 2)) class TestBroadcastedIndexingAdapter(TestCase): From ffccff1241470d822d42c7237e774872dd1414c2 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 21 Jul 2017 22:12:54 +0900 Subject: [PATCH 017/113] Wrap LazilyIndexedArray so that it accepts broadcasted-indexers --- xarray/core/indexing.py | 42 ++++++++++++++++++++++++++++++----- xarray/tests/test_indexing.py | 8 ++++--- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 6b8d5842838..777965ebf6c 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -369,8 +369,11 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): return indexer -class LazilyIndexedArray(utils.NDArrayMixin): +class OrthogonalLazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy + + This is array is indexed by orthogonal-indexing. For using broadcasted + indexers, use LazilyIndexedArray. 
""" def __init__(self, array, key=None): """ @@ -388,7 +391,7 @@ def __init__(self, array, key=None): self.key = key def _updated_key(self, new_key): - new_key = iter(unbroadcast_indexers(new_key, self.shape)) + new_key = iter(new_key) key = [] for size, k in zip(self.array.shape, self.key): if isinstance(k, integer_types): @@ -408,10 +411,10 @@ def shape(self): return tuple(shape) def __array__(self, dtype=None): - array = broadcasted_indexable(self.array) - return np.asarray(array[self.key], dtype=None) + return np.asarray(self.array[self.key], dtype=None) def __getitem__(self, key): + key = expanded_indexer(key, self.ndim) return type(self)(self.array, self._updated_key(key)) def __setitem__(self, key, value): @@ -423,6 +426,24 @@ def __repr__(self): (type(self).__name__, self.array, self.key)) +class LazilyIndexedArray(utils.NDArrayMixin): + """ Wrap an array that handles orthogonal indexing to make indexing lazy + + This is LazilyIndexedArray is indexed by broadcaseted-indexing. + For using orthogonal indexers, use OrthogonalLazilyIndexedArray. + """ + def __init__(self, array, key=None): + self.array = BroadcastedIndexingAdapter( + OrthogonalLazilyIndexedArray(array, key)) + + @property + def shape(self): + return self.array.array.shape + + def __repr__(self): + return self.array.array.__repr__() + + def _wrap_numpy_scalars(array): """Wrap NumPy scalars in 0d arrays.""" if np.isscalar(array): @@ -529,14 +550,23 @@ def unbroadcast_indexers(key, shape): class BroadcastedIndexingAdapter(utils.NDArrayMixin): - """ An array wrapper for orthogonally indexed arrays, such as netCDF. """ + """ An array wrapper for orthogonally indexed arrays, such as netCDF + in order to indexed by broadcasted indexers. 
""" def __init__(self, array): self.array = array + def __array__(self, dtype=None): + return np.asarray(self.array, dtype=dtype) + def __getitem__(self, key): key = expanded_indexer(key, self.ndim) key = unbroadcast_indexers(key, self.shape) - return self.array[key] + return type(self)(self.array[key]) + + def __setitem__(self, key, value): + key = expanded_indexer(key, self.ndim) + key = unbroadcast_indexers(key, self.shape) + self.array[key] = value def broadcasted_indexable(array): diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 73238d9f4ca..da8eb35fd8b 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -282,13 +282,13 @@ def test_slice_slice(self): self.assertArrayEqual(expected, actual) def test_lazily_indexed_array(self): - x = NumpyOrthogonalIndexingAdapter(np.random.rand(10, 20, 30)) - v = Variable(['i', 'j', 'k'], x.array) + x = np.random.rand(10, 20, 30) + v = Variable(['i', 'j', 'k'], x) lazy = indexing.LazilyIndexedArray(x) v_lazy = Variable(['i', 'j', 'k'], lazy) I = ReturnItem() # test orthogonally applied indexers - indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], np.arange(10) < 5] + indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], [0], np.arange(10) < 5] for i in indexers: for j in indexers: for k in indexers: @@ -426,3 +426,5 @@ def test_basic(self): self.assertArrayEqual(expected, actual_ortho) self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) + self.assertTrue(type(actual), + indexing.BroadcastedIndexingAdapter) From becf539c6eb662287242b33e7f3eb4347944d565 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 21 Jul 2017 22:25:06 +0900 Subject: [PATCH 018/113] small rename --- xarray/core/indexing.py | 8 ++++---- xarray/tests/test_indexing.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 777965ebf6c..c9e0cd73d40 100644 --- a/xarray/core/indexing.py +++ 
b/xarray/core/indexing.py @@ -492,7 +492,7 @@ def __setitem__(self, key, value): self.array[key] = value -def unbroadcast_indexers(key, shape): +def unbroadcast_indexes(key, shape): """ Convert broadcasted indexers to orthogonal indexers. If there is no valid mapping, raises IndexError. @@ -560,12 +560,12 @@ def __array__(self, dtype=None): def __getitem__(self, key): key = expanded_indexer(key, self.ndim) - key = unbroadcast_indexers(key, self.shape) + key = unbroadcast_indexes(key, self.shape) return type(self)(self.array[key]) def __setitem__(self, key, value): key = expanded_indexer(key, self.ndim) - key = unbroadcast_indexers(key, self.shape) + key = unbroadcast_indexes(key, self.shape) self.array[key] = value @@ -612,7 +612,7 @@ def __init__(self, array): def _broadcast_indexes(self, key): try: - return unbroadcast_indexers(key, self.shape) + return unbroadcast_indexes(key, self.shape) # TODO: handle point-wise indexing with vindex except IndexError: raise IndexError( diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index da8eb35fd8b..40a4bb32c9f 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -360,7 +360,7 @@ def test_index_scalar(self): assert np.array(x[0][()]) == 'foo' -class Test_unbroadcast_indexers(TestCase): +class Test_unbroadcast_indexes(TestCase): def assert1dIndexEqual(self, x, y, size): """ Compare 1d vector, slice, array """ def vectorize(array): @@ -384,13 +384,13 @@ def test(self): for j in indexers: for k in indexers: dims, indexer = v._broadcast_indexes((i, j, k)) - orthogonalized = indexing.unbroadcast_indexers( + orthogonalized = indexing.unbroadcast_indexes( indexer, v.shape) dim_new, indexer_new = v._broadcast_indexes(orthogonalized) self.assertArrayEqual(original[indexer], original[indexer_new]) - orthogonalized_new = indexing.unbroadcast_indexers( + orthogonalized_new = indexing.unbroadcast_indexes( indexer_new, v.shape) self.assertArrayEqual(orthogonalized[0], 
orthogonalized_new[0]) @@ -399,10 +399,10 @@ def test(self): def test_error(self): with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): - indexing.unbroadcast_indexers((np.ones((2, 2)), np.ones((2, 1))), + indexing.unbroadcast_indexes((np.ones((2, 2)), np.ones((2, 1))), shape=(3, 2)) with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): - indexing.unbroadcast_indexers((np.ones((1, 2)), np.ones((2, 1))), + indexing.unbroadcast_indexes((np.ones((1, 2)), np.ones((2, 1))), shape=(3, 2)) From 1ae4b4cbab7db146ba6e3c27a02daf30500150ea Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 21 Jul 2017 22:45:06 +0900 Subject: [PATCH 019/113] Another small fix --- xarray/core/indexing.py | 18 +++++++++--------- xarray/tests/test_indexing.py | 11 ++++++----- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c9e0cd73d40..060f400f9fa 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -42,7 +42,6 @@ def expanded_indexer(key, ndim): return tuple(new_key) -# TODO should be deprecated def canonicalize_indexer(key, ndim): """Given an indexer for orthogonal array indexing, return an indexer that is a tuple composed entirely of slices, integer ndarrays and native python @@ -433,7 +432,7 @@ class LazilyIndexedArray(utils.NDArrayMixin): For using orthogonal indexers, use OrthogonalLazilyIndexedArray. 
""" def __init__(self, array, key=None): - self.array = BroadcastedIndexingAdapter( + self.array = BroadcastIndexedAdapter( OrthogonalLazilyIndexedArray(array, key)) @property @@ -543,13 +542,10 @@ def unbroadcast_indexes(key, shape): else: key[i] = np.ravel(k) cursor += 1 - key = tuple(k if isinstance(k, integer_types) - else maybe_convert_to_slice(k, size) - for k, size in zip(key, shape)) - return key + return tuple(key) -class BroadcastedIndexingAdapter(utils.NDArrayMixin): +class BroadcastIndexedAdapter(utils.NDArrayMixin): """ An array wrapper for orthogonally indexed arrays, such as netCDF in order to indexed by broadcasted indexers. """ def __init__(self, array): @@ -612,9 +608,13 @@ def __init__(self, array): def _broadcast_indexes(self, key): try: - return unbroadcast_indexes(key, self.shape) - # TODO: handle point-wise indexing with vindex + key = unbroadcast_indexes(key, self.shape) + return tuple(k if isinstance(k, (integer_types, slice)) + else maybe_convert_to_slice(k, size) + for k, size in zip(key, self.shape)) + except IndexError: + # TODO: handle point-wise indexing with vindex raise IndexError( 'dask does not support fancy indexing with key: {}'.format(key)) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 40a4bb32c9f..b2c06a31766 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -282,8 +282,9 @@ def test_slice_slice(self): self.assertArrayEqual(expected, actual) def test_lazily_indexed_array(self): - x = np.random.rand(10, 20, 30) - v = Variable(['i', 'j', 'k'], x) + original = np.random.rand(10, 20, 30) + x = NumpyOrthogonalIndexingAdapter(original) + v = Variable(['i', 'j', 'k'], original) lazy = indexing.LazilyIndexedArray(x) v_lazy = Variable(['i', 'j', 'k'], lazy) I = ReturnItem() @@ -406,12 +407,12 @@ def test_error(self): shape=(3, 2)) -class TestBroadcastedIndexingAdapter(TestCase): +class TestBroadcastIndexedAdapter(TestCase): def test_basic(self): original = 
np.random.rand(10, 20, 30) v = Variable(('i', 'j', 'k'), original) orthogonal = NumpyOrthogonalIndexingAdapter(original) - wrapped = indexing.BroadcastedIndexingAdapter(orthogonal) + wrapped = indexing.BroadcastIndexedAdapter(orthogonal) I = ReturnItem() # test broadcasted indexers indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], [0], np.arange(10) < 5] @@ -427,4 +428,4 @@ def test_basic(self): self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) self.assertTrue(type(actual), - indexing.BroadcastedIndexingAdapter) + indexing.BroadcastIndexedAdapter) From 1967bf5cafb3e3bce58228f9c299de274c3b0118 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 22 Jul 2017 08:30:33 +0900 Subject: [PATCH 020/113] Remove unused function. --- xarray/core/indexing.py | 11 +++++------ xarray/tests/test_indexing.py | 13 ------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 060f400f9fa..768b94343e2 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -542,7 +542,10 @@ def unbroadcast_indexes(key, shape): else: key[i] = np.ravel(k) cursor += 1 - return tuple(key) + + return tuple(k if isinstance(k, (integer_types, slice)) + else maybe_convert_to_slice(k, size) + for k, size in zip(key, shape)) class BroadcastIndexedAdapter(utils.NDArrayMixin): @@ -608,11 +611,7 @@ def __init__(self, array): def _broadcast_indexes(self, key): try: - key = unbroadcast_indexes(key, self.shape) - return tuple(k if isinstance(k, (integer_types, slice)) - else maybe_convert_to_slice(k, size) - for k, size in zip(key, self.shape)) - + return unbroadcast_indexes(key, self.shape) except IndexError: # TODO: handle point-wise indexing with vindex raise IndexError( diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index b2c06a31766..a6f0ae03682 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -362,19 +362,6 @@ def 
test_index_scalar(self): class Test_unbroadcast_indexes(TestCase): - def assert1dIndexEqual(self, x, y, size): - """ Compare 1d vector, slice, array """ - def vectorize(array): - if isinstance(array, slice): # slice - return np.arange(*array.indices(size)) - if hasattr(array, 'dtype') and array.dtype.kind == 'b': - # boolean array - return np.arange(len(array))[array] - return np.arange(size)[array] - - self.assertArrayEqual(vectorize(x), vectorize(y)) - self.assertArrayEqual(vectorize(x).shape, vectorize(y).shape) - def test(self): original = np.random.rand(10, 20, 30) v = Variable(('i', 'j', 'k'), original) From c2eeff365402f3c2bff83d7f4e86e879c0b4e734 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 23 Jul 2017 20:29:33 +0900 Subject: [PATCH 021/113] Added _broadcast_indexes_1vector --- xarray/core/indexing.py | 61 ++++++++++++++--------------------------- xarray/core/variable.py | 18 ++++++++++-- 2 files changed, 37 insertions(+), 42 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 768b94343e2..a226f4f2df6 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -507,45 +507,19 @@ def unbroadcast_indexes(key, shape): # basic indexing return key - # Here, we need to distinguish integer and 1-element array - # E.G. 
If (key[0].shape, key[1].shape, key[2].shape) is - # ((4, 1), (1, 1), (1, 3)) -> result_shape = [4, 3] - # ((4, 1, 1), (1, 1, 1), (1, 1, 3)) -> result_shape = [4, 1, 3] - result_shape = [1] * key[0].ndim + i_dim = 0 + orthogonal_keys = [] for k in key: - if k.size != 1: - try: - result_shape[k.shape.index(k.size)] = k.size - except ValueError: + if hasattr(k, 'shape'): # array + if k.shape[i_dim] != k.size: raise IndexError( "Indexer cannot be orthogonalized: {}".format(k)) - # sanity check - dims = [k.shape.index(k.size) for k in key] - if dims[0] != 0: - raise IndexError("Indexer cannot be orthogonalized: {}".format(key)) - while len(dims) > 0: # remove 0 - try: - dims.remove(0) - except ValueError: - break - for i in range(1, len(dims)): - if dims[i-1] > dims[i]: # dims should be increasing - raise IndexError("Indexer cannot be orthogonalized:{}".format(key)) - - cursor = 0 # index moving in result_shape - key = list(key) - for i, k in enumerate(key): - if (k.size == 1 and - (len(result_shape) <= cursor or result_shape[cursor] != 1)): - # integer - key[i] = k.item() - else: - key[i] = np.ravel(k) - cursor += 1 - - return tuple(k if isinstance(k, (integer_types, slice)) - else maybe_convert_to_slice(k, size) - for k, size in zip(key, shape)) + else: + i_dim += 1 + orthogonal_keys.append(np.ravel(k)) + else: # integer + orthogonal_keys.append(k) + return tuple(orthogonal_keys) class BroadcastIndexedAdapter(utils.NDArrayMixin): @@ -609,20 +583,27 @@ def __init__(self, array): """ self.array = array - def _broadcast_indexes(self, key): + def _orthogonalize_indexes(self, key): try: - return unbroadcast_indexes(key, self.shape) + key = unbroadcast_indexes(key, self.shape) + # convert them to slice if possible + return tuple(k if isinstance(k, (integer_types, slice)) + else maybe_convert_to_slice(k, size) + for k, size in zip(key, self.shape)) + except IndexError: # TODO: handle point-wise indexing with vindex raise IndexError( 'dask does not support fancy indexing 
with key: {}'.format(key)) def __getitem__(self, key): - key = self._broadcast_indexes(key) + key = self._orthogonalize_indexes(key) + # TODO any orthogonalized key can be indexed recursively. + # TODO support vindex return self.array[key] def __setitem__(self, key, value): - key = self._broadcast_indexes(key) + key = self._orthogonalize_indexes(key) raise TypeError("this variable's data is stored in a dask array, " 'which does not support item assignment. To ' 'assign to this variable, you must first load it ' diff --git a/xarray/core/variable.py b/xarray/core/variable.py index dc3cdef2ee9..0fbe1d63834 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -405,13 +405,25 @@ def _broadcast_indexes(self, key): if all([isinstance(k, basic_indexing_types) for k in key]): return self._broadcast_indexes_basic(key) else: - return self._broadcast_indexes_advanced(key) + vindexes = [np.asarray(k) for k in key if + not isinstance(k, integer_types + (slice,))] + # slices and only one vector, no integers. + if (len(vindexes) == 0 and vindexes[0].ndim == 1 and + sum([isinstance(k, slice) for k in key]) == len(key)-1): + return self._broadcast_indexes_1vector(key) + else: # fancy indexing + return self._broadcast_indexes_advanced(key) def _broadcast_indexes_basic(self, key): dims = tuple(dim for k, dim in zip(key, self.dims) if not isinstance(k, integer_types)) return dims, key + def _broadcast_indexes_1vector(self, key): + dims = tuple(key.dims[0] if hasattr(k, 'dims') else dim + for k, dim in zip(key, self.dims)) + return dims, key + def nonzero(self): """ Equivalent numpy's nonzero but returns a tuple of Varibles. 
""" # TODO we should replace dask's native nonzero @@ -442,7 +454,9 @@ def _broadcast_indexes_advanced(self, key): variables.append(variable) variables = _broadcast_compat_variables(*variables) dims = variables[0].dims # all variables have the same dims - key = tuple(variable.data for variable in variables) + # overwrite if there is integers + key = tuple(k if isinstance(k, integer_types) else variable.data + for variable, k in zip(variables, key)) return dims, key def __getitem__(self, key): From df12c040f6138cfe017456b9b14d2737e466b746 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 23 Jul 2017 21:38:20 +0900 Subject: [PATCH 022/113] Minor fix --- xarray/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index a226f4f2df6..b1a41c45c75 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -510,7 +510,7 @@ def unbroadcast_indexes(key, shape): i_dim = 0 orthogonal_keys = [] for k in key: - if hasattr(k, 'shape'): # array + if hasattr(k, '__len__'): # array if k.shape[i_dim] != k.size: raise IndexError( "Indexer cannot be orthogonalized: {}".format(k)) From 5ba367d75e5e2a22ff210b63d7b0d09578b4d7a0 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 23 Jul 2017 22:53:12 +0900 Subject: [PATCH 023/113] Avoid doubly wrapping by LazilyIndexedArray --- xarray/core/indexing.py | 41 +++++++++++++++-------------------- xarray/tests/test_backends.py | 2 +- xarray/tests/test_indexing.py | 6 +++++ 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index b1a41c45c75..47f39be0fe1 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -368,10 +368,10 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): return indexer -class OrthogonalLazilyIndexedArray(utils.NDArrayMixin): +class LazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing 
lazy - This is array is indexed by orthogonal-indexing. For using broadcasted + This is array is indexed by broadcasted-indexing. For using broadcasted indexers, use LazilyIndexedArray. """ def __init__(self, array, key=None): @@ -384,10 +384,18 @@ def __init__(self, array, key=None): Array indexer. If provided, it is assumed to already be in canonical expanded form. """ - if key is None: - key = (slice(None),) * array.ndim - self.array = array - self.key = key + # We need to ensure that self.array is not LazilyIndexedArray, + # because LazilyIndexedArray is not orthogonaly indexable + if isinstance(array, type(self)): + self.array = array.array + self.key = array.key + if key is not None: + self.key = self._updated_key(key) + else: + if key is None: + key = (slice(None),) * array.ndim + self.array = array + self.key = key def _updated_key(self, new_key): new_key = iter(new_key) @@ -414,9 +422,12 @@ def __array__(self, dtype=None): def __getitem__(self, key): key = expanded_indexer(key, self.ndim) + key = unbroadcast_indexes(key, self.shape) return type(self)(self.array, self._updated_key(key)) def __setitem__(self, key, value): + key = expanded_indexer(key, self.ndim) + key = unbroadcast_indexes(key, self.shape) key = self._updated_key(key) self.array[key] = value @@ -425,24 +436,6 @@ def __repr__(self): (type(self).__name__, self.array, self.key)) -class LazilyIndexedArray(utils.NDArrayMixin): - """ Wrap an array that handles orthogonal indexing to make indexing lazy - - This is LazilyIndexedArray is indexed by broadcaseted-indexing. - For using orthogonal indexers, use OrthogonalLazilyIndexedArray. 
- """ - def __init__(self, array, key=None): - self.array = BroadcastIndexedAdapter( - OrthogonalLazilyIndexedArray(array, key)) - - @property - def shape(self): - return self.array.array.shape - - def __repr__(self): - return self.array.array.__repr__() - - def _wrap_numpy_scalars(array): """Wrap NumPy scalars in 0d arrays.""" if np.isscalar(array): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fa6fff9f4b..f4ae37079f3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -319,7 +319,7 @@ def test_roundtrip_boolean_dtype(self): def test_orthogonal_indexing(self): in_memory = create_test_data() with self.roundtrip(in_memory) as on_disk: - indexers = {'dim1': np.arange(3), 'dim2': np.arange(4), + indexers = {'dim1': [1, 2, 0], 'dim2': [3, 2, 0, 3], 'dim3': np.arange(5)} expected = in_memory.isel(**indexers) actual = on_disk.isel(**indexers) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index a6f0ae03682..d2ce2172c33 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -299,6 +299,8 @@ def test_lazily_indexed_array(self): v_lazy[:, :, k][:, j][i]]: self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) + self.assertTrue(isinstance( + actual._data, indexing.LazilyIndexedArray)) # test sequentially applied indexers indexers = [(3, 2), (I[:], 0), (I[:2], -1), (I[:4], [0]), ([4, 5], 0), ([0, 1, 2], [0, 1]), ([0, 3, 5], I[:2])] @@ -307,6 +309,10 @@ def test_lazily_indexed_array(self): actual = v_lazy[i][j] self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) + self.assertTrue(isinstance( + actual._data, indexing.LazilyIndexedArray)) + self.assertTrue(isinstance(actual._data.array, + NumpyOrthogonalIndexingAdapter)) class TestCopyOnWriteArray(TestCase): From d25c1f1cbd28ec52ae1882138231732b9818df05 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 24 Jul 2017 08:37:54 +0900 Subject: 
[PATCH 024/113] General orthogonal indexing for dask array. --- xarray/core/indexing.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 47f39be0fe1..41e8e246dd8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -577,24 +577,28 @@ def __init__(self, array): self.array = array def _orthogonalize_indexes(self, key): - try: - key = unbroadcast_indexes(key, self.shape) - # convert them to slice if possible - return tuple(k if isinstance(k, (integer_types, slice)) - else maybe_convert_to_slice(k, size) - for k, size in zip(key, self.shape)) + key = unbroadcast_indexes(key, self.shape) + # convert them to slice if possible + return tuple(k if isinstance(k, (integer_types, slice)) + else maybe_convert_to_slice(k, size) + for k, size in zip(key, self.shape)) + def __getitem__(self, key): + try: + key = self._orthogonalize_indexes(key) + try: + return self.array[key] + except NotImplementedError: + # manual orthogonal indexing. + value = self.array + for axis, subkey in reversed(list(enumerate(key))): + value = value[(slice(None),) * axis + (subkey,)] + return value except IndexError: - # TODO: handle point-wise indexing with vindex + # TODO should support vindex raise IndexError( 'dask does not support fancy indexing with key: {}'.format(key)) - def __getitem__(self, key): - key = self._orthogonalize_indexes(key) - # TODO any orthogonalized key can be indexed recursively. 
- # TODO support vindex - return self.array[key] - def __setitem__(self, key, value): key = self._orthogonalize_indexes(key) raise TypeError("this variable's data is stored in a dask array, " From 0115994ff4f4b73087ab5bc481c03eef9c4fde5e Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 24 Jul 2017 19:35:27 +0900 Subject: [PATCH 025/113] Added base class IndexableArrayAdapter --- xarray/backends/scipy_.py | 6 ++-- xarray/core/indexing.py | 59 +++++++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 0c15085c618..8c27a532bd7 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -11,7 +11,7 @@ from ..core.pycompat import iteritems, OrderedDict, basestring from ..core.utils import (Frozen, FrozenOrderedDict, NdimSizeLenMixin, DunderArrayMixin) -from ..core.indexing import NumpyIndexingAdapter +from ..core.indexing import IndexableArrayAdapter, NumpyIndexingAdapter from .common import WritableCFDataStore, DataStorePickleMixin from .netcdf3 import (is_valid_nc3_name, encode_nc3_attr_value, @@ -31,9 +31,11 @@ def _decode_attrs(d): for (k, v) in iteritems(d)) -class ScipyArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): +class ScipyArrayWrapper(IndexableArrayAdapter, NdimSizeLenMixin, + DunderArrayMixin): def __init__(self, variable_name, datastore): + super(ScipyArrayWrapper, self).__init__('broadcast') self.datastore = datastore self.variable_name = variable_name array = self.get_array() diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 41e8e246dd8..e1b66d629ef 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -368,7 +368,29 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): return indexer -class LazilyIndexedArray(utils.NDArrayMixin): +class IndexableArrayAdapter(object): + """ Base class for array adapters subject for orthogonal-indexing or + broadcasted-indexing. 
+ + indexing_type: One of `orthogonal` or `broadcast` + """ + def __init__(self, indexing_type='orthogonal'): + assert indexing_type in ['orthogonal', 'broadcast'] + self._indexing_type = indexing_type + + @property + def indexing_type(self): + return self._indexing_type + + +def indexing_type_of(array): + if isinstance(array, np.ndarray): + return 'broadcast' + else: + return getattr(array, 'indexing_type', 'orthogonal') + + +class LazilyIndexedArray(IndexableArrayAdapter, utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy This is array is indexed by broadcasted-indexing. For using broadcasted @@ -384,6 +406,7 @@ def __init__(self, array, key=None): Array indexer. If provided, it is assumed to already be in canonical expanded form. """ + super(LazilyIndexedArray, self).__init__(indexing_type='broadcast') # We need to ensure that self.array is not LazilyIndexedArray, # because LazilyIndexedArray is not orthogonaly indexable if isinstance(array, type(self)): @@ -391,6 +414,7 @@ def __init__(self, array, key=None): self.key = array.key if key is not None: self.key = self._updated_key(key) + else: if key is None: key = (slice(None),) * array.ndim @@ -418,7 +442,14 @@ def shape(self): return tuple(shape) def __array__(self, dtype=None): - return np.asarray(self.array[self.key], dtype=None) + if indexing_type_of(self.array) == 'broadcast': + # manual orthogonal indexing. 
+ value = np.asarray(self.array, dtype=None) + for axis, subkey in reversed(list(enumerate(self.key))): + value = value[(slice(None),) * axis + (subkey,)] + return value + else: + return np.asarray(self.array[self.key], dtype=None) def __getitem__(self, key): key = expanded_indexer(key, self.ndim) @@ -429,6 +460,12 @@ def __setitem__(self, key, value): key = expanded_indexer(key, self.ndim) key = unbroadcast_indexes(key, self.shape) key = self._updated_key(key) + + if indexing_type_of(self.array) == 'broadcast': + # TODO Should prepare LazilyIndexedArray for + # BroadcastIndexableAdapter + raise NotImplementedError('LaziyIndexedArray wrapps ' + 'OrthogonalIndexableAdapter.') self.array[key] = value def __repr__(self): @@ -444,8 +481,9 @@ def _wrap_numpy_scalars(array): return array -class CopyOnWriteArray(utils.NDArrayMixin): +class CopyOnWriteArray(IndexableArrayAdapter, utils.NDArrayMixin): def __init__(self, array): + super(CopyOnWriteArray, self).__init__(indexing_type_of(array)) self.array = array self._copied = False @@ -465,8 +503,9 @@ def __setitem__(self, key, value): self.array[key] = value -class MemoryCachedArray(utils.NDArrayMixin): +class MemoryCachedArray(IndexableArrayAdapter, utils.NDArrayMixin): def __init__(self, array): + super(MemoryCachedArray, self).__init__(indexing_type_of(array)) self.array = _wrap_numpy_scalars(array) def _ensure_cached(self): @@ -515,10 +554,11 @@ def unbroadcast_indexes(key, shape): return tuple(orthogonal_keys) -class BroadcastIndexedAdapter(utils.NDArrayMixin): +class BroadcastIndexedAdapter(IndexableArrayAdapter, utils.NDArrayMixin): """ An array wrapper for orthogonally indexed arrays, such as netCDF in order to indexed by broadcasted indexers. 
""" def __init__(self, array): + super(BroadcastIndexedAdapter, self).__init__('broadcast') self.array = array def __array__(self, dtype=None): @@ -545,10 +585,11 @@ def broadcasted_indexable(array): return array -class NumpyIndexingAdapter(utils.NDArrayMixin): +class NumpyIndexingAdapter(IndexableArrayAdapter, utils.NDArrayMixin): """Wrap a NumPy array to use broadcasted indexing """ def __init__(self, array): + super(NumpyIndexingAdapter, self).__init__('broadcast') self.array = array def _ensure_ndarray(self, value): @@ -567,13 +608,14 @@ def __setitem__(self, key, value): self.array[key] = value -class DaskIndexingAdapter(utils.NDArrayMixin): +class DaskIndexingAdapter(IndexableArrayAdapter, utils.NDArrayMixin): """Wrap a dask array to support broadcasted-indexing. """ def __init__(self, array): """ This adapter is usually called in Variable.__getitem__ with array=Variable._broadcast_indexes """ + super(DaskIndexingAdapter, self).__init__('broadcast') self.array = array def _orthogonalize_indexes(self, key): @@ -609,11 +651,12 @@ def __setitem__(self, key, value): self.array[key] = value -class PandasIndexAdapter(utils.NDArrayMixin): +class PandasIndexAdapter(IndexableArrayAdapter, utils.NDArrayMixin): """Wrap a pandas.Index to be better about preserving dtypes and to handle indexing by length 1 tuples like numpy """ def __init__(self, array, dtype=None): + super(PandasIndexAdapter, self).__init__('broadcast') self.array = utils.safe_cast_to_index(array) if dtype is None: if isinstance(array, pd.PeriodIndex): From 1b4e854e805166e6abcfe184a736d6ff5e2929a6 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 09:44:12 +0900 Subject: [PATCH 026/113] Deprecate _unbroadcast_indexers and support IndexerTuple classes --- xarray/core/indexing.py | 162 +++++++++++++++------------------- xarray/core/variable.py | 62 +++++++++---- xarray/tests/test_indexing.py | 63 +++---------- xarray/tests/test_variable.py | 39 ++++++++ 4 files changed, 167 insertions(+), 
159 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e1b66d629ef..c8d315042bb 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -42,6 +42,7 @@ def expanded_indexer(key, ndim): return tuple(new_key) +# TODO deprecate def canonicalize_indexer(key, ndim): """Given an indexer for orthogonal array indexing, return an indexer that is a tuple composed entirely of slices, integer ndarrays and native python @@ -368,6 +369,51 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): return indexer +class IndexerTuple(tuple): + """ Base class for xarray indexing tuples """ + def __repr__(self): + return type(self).__name__ + super(IndexerTuple, self).__repr__() + + +class BasicIndexer(IndexerTuple): + """ Tuple for basic indexing. """ + + +class OuterIndexer(IndexerTuple): + """ Tuple for outer/orthogonal indexing. + All the item is one of integer, slice, and 1d-np.ndarray. + """ + def vectorize(self, shape): + """ Convert to a vectorized indexer. + shape: shape of the array subject to the indexing. + """ + if len([k for k in self if not isinstance(k, slice)]) <= 1: + # if there is only one vector and all others are slice, + # it can be safely converted to vectorized indexer + return VectorizedIndexer(self) + else: + n_dim = len([k for k in self if not isinstance(k, integer_types)]) + i_dim = 0 + new_key = [] + for k, size in zip(self, shape): + if isinstance(k, integer_types): + new_key.append(k) + else: # np.ndarray or slice + if isinstance(k, slice): + k = np.arange(*k.indices(size)) + if k.dtype.kind == 'b': + k = k.nonzero()[0] + shape = [(1,) * i_dim + (k.size, ) + + (1,) * (n_dim - i_dim - 1)] + new_key.append(k.reshape(*shape)) + i_dim += 1 + return VectorizedIndexer(new_key) + + +class VectorizedIndexer(IndexerTuple): + """ Tuple for vectorized indexing """ + + class IndexableArrayAdapter(object): """ Base class for array adapters subject for orthogonal-indexing or broadcasted-indexing. 
@@ -390,7 +436,7 @@ def indexing_type_of(array): return getattr(array, 'indexing_type', 'orthogonal') -class LazilyIndexedArray(IndexableArrayAdapter, utils.NDArrayMixin): +class LazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy This is array is indexed by broadcasted-indexing. For using broadcasted @@ -406,9 +452,7 @@ def __init__(self, array, key=None): Array indexer. If provided, it is assumed to already be in canonical expanded form. """ - super(LazilyIndexedArray, self).__init__(indexing_type='broadcast') - # We need to ensure that self.array is not LazilyIndexedArray, - # because LazilyIndexedArray is not orthogonaly indexable + # We need to avoid doubly wrapping. if isinstance(array, type(self)): self.array = array.array self.key = array.key @@ -418,6 +462,8 @@ def __init__(self, array, key=None): else: if key is None: key = (slice(None),) * array.ndim + if len(key) > 0: + key = OuterIndexer(key) self.array = array self.key = key @@ -429,7 +475,7 @@ def _updated_key(self, new_key): key.append(k) else: key.append(_index_indexer_1d(k, next(new_key), size)) - return tuple(key) + return () if len(key) == 0 else OuterIndexer(key) @property def shape(self): @@ -442,30 +488,23 @@ def shape(self): return tuple(shape) def __array__(self, dtype=None): - if indexing_type_of(self.array) == 'broadcast': - # manual orthogonal indexing. 
- value = np.asarray(self.array, dtype=None) - for axis, subkey in reversed(list(enumerate(self.key))): - value = value[(slice(None),) * axis + (subkey,)] - return value - else: - return np.asarray(self.array[self.key], dtype=None) + array = broadcasted_indexable(self.array) + return np.asarray(array[self.key], dtype=None) def __getitem__(self, key): + if isinstance(key, VectorizedIndexer): + raise NotImplementedError('Vectorized indexing for {} is not ' + 'implemented.'.format(type(self))) key = expanded_indexer(key, self.ndim) - key = unbroadcast_indexes(key, self.shape) + print(key) return type(self)(self.array, self._updated_key(key)) def __setitem__(self, key, value): + if isinstance(key, VectorizedIndexer): + raise NotImplementedError('Vectorized indexing for {} is not ' + 'implemented.'.format(type(self))) key = expanded_indexer(key, self.ndim) - key = unbroadcast_indexes(key, self.shape) key = self._updated_key(key) - - if indexing_type_of(self.array) == 'broadcast': - # TODO Should prepare LazilyIndexedArray for - # BroadcastIndexableAdapter - raise NotImplementedError('LaziyIndexedArray wrapps ' - 'OrthogonalIndexableAdapter.') self.array[key] = value def __repr__(self): @@ -481,9 +520,8 @@ def _wrap_numpy_scalars(array): return array -class CopyOnWriteArray(IndexableArrayAdapter, utils.NDArrayMixin): +class CopyOnWriteArray(utils.NDArrayMixin): def __init__(self, array): - super(CopyOnWriteArray, self).__init__(indexing_type_of(array)) self.array = array self._copied = False @@ -503,9 +541,8 @@ def __setitem__(self, key, value): self.array[key] = value -class MemoryCachedArray(IndexableArrayAdapter, utils.NDArrayMixin): +class MemoryCachedArray(utils.NDArrayMixin): def __init__(self, array): - super(MemoryCachedArray, self).__init__(indexing_type_of(array)) self.array = _wrap_numpy_scalars(array) def _ensure_cached(self): @@ -523,58 +560,6 @@ def __setitem__(self, key, value): self.array[key] = value -def unbroadcast_indexes(key, shape): - """ - Convert 
broadcasted indexers to orthogonal indexers. - If there is no valid mapping, raises IndexError. - - key is usually generated by Variable._broadcast_indexes. - - key: tuple of np.ndarray, slice, integer - shape: shape of array - """ - key = expanded_indexer(key, len(shape)) - - if all(isinstance(k, integer_types + (slice,)) for k in key): - # basic indexing - return key - - i_dim = 0 - orthogonal_keys = [] - for k in key: - if hasattr(k, '__len__'): # array - if k.shape[i_dim] != k.size: - raise IndexError( - "Indexer cannot be orthogonalized: {}".format(k)) - else: - i_dim += 1 - orthogonal_keys.append(np.ravel(k)) - else: # integer - orthogonal_keys.append(k) - return tuple(orthogonal_keys) - - -class BroadcastIndexedAdapter(IndexableArrayAdapter, utils.NDArrayMixin): - """ An array wrapper for orthogonally indexed arrays, such as netCDF - in order to indexed by broadcasted indexers. """ - def __init__(self, array): - super(BroadcastIndexedAdapter, self).__init__('broadcast') - self.array = array - - def __array__(self, dtype=None): - return np.asarray(self.array, dtype=dtype) - - def __getitem__(self, key): - key = expanded_indexer(key, self.ndim) - key = unbroadcast_indexes(key, self.shape) - return type(self)(self.array[key]) - - def __setitem__(self, key, value): - key = expanded_indexer(key, self.ndim) - key = unbroadcast_indexes(key, self.shape) - self.array[key] = value - - def broadcasted_indexable(array): if isinstance(array, np.ndarray): return NumpyIndexingAdapter(array) @@ -585,11 +570,10 @@ def broadcasted_indexable(array): return array -class NumpyIndexingAdapter(IndexableArrayAdapter, utils.NDArrayMixin): +class NumpyIndexingAdapter(utils.NDArrayMixin): """Wrap a NumPy array to use broadcasted indexing """ def __init__(self, array): - super(NumpyIndexingAdapter, self).__init__('broadcast') self.array = array def _ensure_ndarray(self, value): @@ -602,9 +586,13 @@ def _ensure_ndarray(self, value): return value def __getitem__(self, key): + if 
isinstance(key, OuterIndexer): + key = key.vectorize(self.shape) return self._ensure_ndarray(self.array[key]) def __setitem__(self, key, value): + if isinstance(key, OuterIndexer): + key = key.vectorize(self.shape) self.array[key] = value @@ -618,16 +606,8 @@ def __init__(self, array): super(DaskIndexingAdapter, self).__init__('broadcast') self.array = array - def _orthogonalize_indexes(self, key): - key = unbroadcast_indexes(key, self.shape) - # convert them to slice if possible - return tuple(k if isinstance(k, (integer_types, slice)) - else maybe_convert_to_slice(k, size) - for k, size in zip(key, self.shape)) - def __getitem__(self, key): - try: - key = self._orthogonalize_indexes(key) + if not isinstance(key, VectorizedIndexer): try: return self.array[key] except NotImplementedError: @@ -636,19 +616,17 @@ def __getitem__(self, key): for axis, subkey in reversed(list(enumerate(key))): value = value[(slice(None),) * axis + (subkey,)] return value - except IndexError: + else: # TODO should support vindex raise IndexError( - 'dask does not support fancy indexing with key: {}'.format(key)) + 'dask does not support vectorized indexing : {}'.format(key)) def __setitem__(self, key, value): - key = self._orthogonalize_indexes(key) raise TypeError("this variable's data is stored in a dask array, " 'which does not support item assignment. To ' 'assign to this variable, you must first load it ' 'into memory explicitly using the .load_data() ' 'method or accessing its .values attribute.') - self.array[key] = value class PandasIndexAdapter(IndexableArrayAdapter, utils.NDArrayMixin): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0fbe1d63834..0ea09fe766a 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -18,7 +18,8 @@ from . 
import utils from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) -from .indexing import (PandasIndexAdapter, broadcasted_indexable) +from .indexing import (PandasIndexAdapter, broadcasted_indexable, BasicIndexer, + OuterIndexer, VectorizedIndexer) import xarray as xr # only for Dataset and DataArray @@ -404,25 +405,51 @@ def _broadcast_indexes(self, key): basic_indexing_types = integer_types + (slice,) if all([isinstance(k, basic_indexing_types) for k in key]): return self._broadcast_indexes_basic(key) - else: - vindexes = [np.asarray(k) for k in key if - not isinstance(k, integer_types + (slice,))] - # slices and only one vector, no integers. - if (len(vindexes) == 0 and vindexes[0].ndim == 1 and - sum([isinstance(k, slice) for k in key]) == len(key)-1): - return self._broadcast_indexes_1vector(key) - else: # fancy indexing - return self._broadcast_indexes_advanced(key) + + # Detect it can be mapped as an outer indexer + # If all key is unlabelled, or + # key can be mapped as an OuterIndexer. + if all(not isinstance(k, Variable) for k in key): + return self._broadcast_indexes_outer(key) + + # If all key is 1-dimensional and there are no duplicate labels, + # key can be mapped as an OuterIndexer. 
+ dims = [] + for k, d in zip(key, self.dims): + if isinstance(k, Variable): + if len(k.dims) > 1: + return self._broadcast_indexes_advanced(key) + dims.append(k.dims[0]) + if not isinstance(k, integer_types): + dims.append(d) + + if len(set(dims)) == len(dims): + return self._broadcast_indexes_outer(key) + + return self._broadcast_indexes_advanced(key) def _broadcast_indexes_basic(self, key): dims = tuple(dim for k, dim in zip(key, self.dims) if not isinstance(k, integer_types)) - return dims, key + return dims, BasicIndexer(key) - def _broadcast_indexes_1vector(self, key): - dims = tuple(key.dims[0] if hasattr(k, 'dims') else dim - for k, dim in zip(key, self.dims)) - return dims, key + def _broadcast_indexes_outer(self, key): + dims = tuple(k.dims[0] if isinstance(k, Variable) else dim + for k, dim in zip(key, self.dims) + if not isinstance(k, integer_types)) + indexer = [] + for k in key: + if isinstance(k, Variable): + indexer.append(k.data) + elif isinstance(k, integer_types + (slice,)): + indexer.append(k) + else: + k = np.asarray(k) + if k.ndim > 1: + raise IndexError("Unlabelled multi-dimensional array " + "cannot be used for indexing.") + indexer.append(k) + return dims, OuterIndexer(indexer) def nonzero(self): """ Equivalent numpy's nonzero but returns a tuple of Varibles. 
""" @@ -455,8 +482,9 @@ def _broadcast_indexes_advanced(self, key): variables = _broadcast_compat_variables(*variables) dims = variables[0].dims # all variables have the same dims # overwrite if there is integers - key = tuple(k if isinstance(k, integer_types) else variable.data - for variable, k in zip(variables, key)) + key = VectorizedIndexer(k if isinstance(k, integer_types) + else variable.data + for variable, k in zip(variables, key)) return dims, key def __getitem__(self, key): diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index d2ce2172c33..22f2f40fa57 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -367,58 +367,21 @@ def test_index_scalar(self): assert np.array(x[0][()]) == 'foo' -class Test_unbroadcast_indexes(TestCase): - def test(self): - original = np.random.rand(10, 20, 30) - v = Variable(('i', 'j', 'k'), original) - I = ReturnItem() - # test broadcasted indexers - indexers = [I[:], 0, -2, I[:3], [4, 1, 2, 3], [0], np.arange(10) < 5] - for i in indexers: - for j in indexers: - for k in indexers: - dims, indexer = v._broadcast_indexes((i, j, k)) - orthogonalized = indexing.unbroadcast_indexes( - indexer, v.shape) - dim_new, indexer_new = v._broadcast_indexes(orthogonalized) - - self.assertArrayEqual(original[indexer], - original[indexer_new]) - orthogonalized_new = indexing.unbroadcast_indexes( - indexer_new, v.shape) - self.assertArrayEqual(orthogonalized[0], - orthogonalized_new[0]) - self.assertArrayEqual(orthogonalized[0], - orthogonalized_new[0]) - - def test_error(self): - with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): - indexing.unbroadcast_indexes((np.ones((2, 2)), np.ones((2, 1))), - shape=(3, 2)) - with self.assertRaisesRegexp(IndexError, 'Indexer cannot be'): - indexing.unbroadcast_indexes((np.ones((1, 2)), np.ones((2, 1))), - shape=(3, 2)) - - -class TestBroadcastIndexedAdapter(TestCase): - def test_basic(self): +class TestIndexerTuple(TestCase): + """ Make sure 
OuterIndexer.vectorize gives similar result to + v._broadcast_indexes_advanced + """ + def test_outer_indexer(self): original = np.random.rand(10, 20, 30) - v = Variable(('i', 'j', 'k'), original) - orthogonal = NumpyOrthogonalIndexingAdapter(original) - wrapped = indexing.BroadcastIndexedAdapter(orthogonal) + v = Variable(['i', 'j', 'k'], original) I = ReturnItem() - # test broadcasted indexers - indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], [0], np.arange(10) < 5] + # test orthogonally applied indexers + indexers = [I[:], 0, -2, I[:3], np.array([0, 1, 2, 3]), np.array([0]), + np.arange(10) < 5] for i in indexers: for j in indexers: for k in indexers: - actual_ortho = orthogonal[i, j, k] - dims, indexer = v._broadcast_indexes((i, j, k)) - expected = original[indexer] - actual = wrapped[indexer] - self.assertEqual(expected.shape, actual_ortho.shape) - self.assertArrayEqual(expected, actual_ortho) - self.assertEqual(expected.shape, actual.shape) - self.assertArrayEqual(expected, actual) - self.assertTrue(type(actual), - indexing.BroadcastIndexedAdapter) + outer_index = indexing.OuterIndexer((i, j, k)) + _, expected = v._broadcast_indexes_advanced((i, j, k)) + actual = outer_index.vectorize(v.shape) + self.assertArrayEqual(v.data[actual], v.data[expected]) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 72aa95bbb05..bec5efe3c12 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -679,6 +679,45 @@ def test_repr_lazy_data(self): self.assertIn('200000 values with dtype', repr(v)) self.assertIsInstance(v._data, LazilyIndexedArray) + def test_detect_indexer_type(self): + """ Tests indexer type was correctly detected. 
""" + data = np.random.random((10, 11)) + v = Variable(['x', 'y'], data) + + _, ind = v._broadcast_indexes((0, 1)) + self.assertTrue(type(ind) == indexing.BasicIndexer) + + _, ind = v._broadcast_indexes((0, slice(0, 8, 2))) + self.assertTrue(type(ind) == indexing.BasicIndexer) + + _, ind = v._broadcast_indexes((0, [0, 1])) + self.assertTrue(type(ind) == indexing.OuterIndexer) + + _, ind = v._broadcast_indexes(([0, 1], 1)) + self.assertTrue(type(ind) == indexing.OuterIndexer) + + _, ind = v._broadcast_indexes(([0, 1], [1, 2])) + self.assertTrue(type(ind) == indexing.OuterIndexer) + + _, ind = v._broadcast_indexes(([0, 1], slice(0, 8, 2))) + self.assertTrue(type(ind) == indexing.OuterIndexer) + + vind = Variable(('a', ), [0, 1]) + _, ind = v._broadcast_indexes((vind, slice(0, 8, 2))) + self.assertTrue(type(ind) == indexing.OuterIndexer) + + vind = Variable(('y', ), [0, 1]) + _, ind = v._broadcast_indexes((vind, 3)) + self.assertTrue(type(ind) == indexing.OuterIndexer) + + vind = Variable(('a', ), [0, 1]) + _, ind = v._broadcast_indexes((vind, vind)) + self.assertTrue(type(ind) == indexing.VectorizedIndexer) + + vind = Variable(('a', 'b'), [[0, 2], [1, 3]]) + _, ind = v._broadcast_indexes((vind, 3)) + self.assertTrue(type(ind) == indexing.VectorizedIndexer) + def test_items(self): data = np.random.random((10, 11)) v = Variable(['x', 'y'], data) From 36d052fc6a23c1ffb358645c6a4311fffd2fecdf Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 10:00:52 +0900 Subject: [PATCH 027/113] removed unintended prints. 
--- xarray/core/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c8d315042bb..144b39756b8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -496,7 +496,6 @@ def __getitem__(self, key): raise NotImplementedError('Vectorized indexing for {} is not ' 'implemented.'.format(type(self))) key = expanded_indexer(key, self.ndim) - print(key) return type(self)(self.array, self._updated_key(key)) def __setitem__(self, key, value): From 9bd53ca22ed36e54c2bcd8a229aaac36e73d39c8 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 10:04:19 +0900 Subject: [PATCH 028/113] Some clean up. --- xarray/core/indexing.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 144b39756b8..ea6706ed82e 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -414,33 +414,8 @@ class VectorizedIndexer(IndexerTuple): """ Tuple for vectorized indexing """ -class IndexableArrayAdapter(object): - """ Base class for array adapters subject for orthogonal-indexing or - broadcasted-indexing. - - indexing_type: One of `orthogonal` or `broadcast` - """ - def __init__(self, indexing_type='orthogonal'): - assert indexing_type in ['orthogonal', 'broadcast'] - self._indexing_type = indexing_type - - @property - def indexing_type(self): - return self._indexing_type - - -def indexing_type_of(array): - if isinstance(array, np.ndarray): - return 'broadcast' - else: - return getattr(array, 'indexing_type', 'orthogonal') - - class LazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy - - This is array is indexed by broadcasted-indexing. For using broadcasted - indexers, use LazilyIndexedArray. 
""" def __init__(self, array, key=None): """ From 563cafa60907fd62fd8697b271626f7491d8d2be Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 10:16:13 +0900 Subject: [PATCH 029/113] Some small fix. --- xarray/backends/scipy_.py | 8 ++++---- xarray/core/indexing.py | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 8c27a532bd7..1805cb28635 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -11,7 +11,7 @@ from ..core.pycompat import iteritems, OrderedDict, basestring from ..core.utils import (Frozen, FrozenOrderedDict, NdimSizeLenMixin, DunderArrayMixin) -from ..core.indexing import IndexableArrayAdapter, NumpyIndexingAdapter +from ..core.indexing import NumpyIndexingAdapter, OuterIndexer from .common import WritableCFDataStore, DataStorePickleMixin from .netcdf3 import (is_valid_nc3_name, encode_nc3_attr_value, @@ -31,11 +31,9 @@ def _decode_attrs(d): for (k, v) in iteritems(d)) -class ScipyArrayWrapper(IndexableArrayAdapter, NdimSizeLenMixin, - DunderArrayMixin): +class ScipyArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): def __init__(self, variable_name, datastore): - super(ScipyArrayWrapper, self).__init__('broadcast') self.datastore = datastore self.variable_name = variable_name array = self.get_array() @@ -48,6 +46,8 @@ def get_array(self): return self.datastore.ds.variables[self.variable_name].data def __getitem__(self, key): + if isinstance(key, OuterIndexer): + key = key.vectorize(self.shape) with self.datastore.ensure_open(autoclose=True): data = NumpyIndexingAdapter(self.get_array())[key] # Copy data if the source file is mmapped. 
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index ea6706ed82e..be57bd6cd83 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -443,6 +443,7 @@ def __init__(self, array, key=None): self.key = key def _updated_key(self, new_key): + # TODO should suport VectorizedIndexer new_key = iter(new_key) key = [] for size, k in zip(self.array.shape, self.key): @@ -570,14 +571,13 @@ def __setitem__(self, key, value): self.array[key] = value -class DaskIndexingAdapter(IndexableArrayAdapter, utils.NDArrayMixin): +class DaskIndexingAdapter(utils.NDArrayMixin): """Wrap a dask array to support broadcasted-indexing. """ def __init__(self, array): """ This adapter is usually called in Variable.__getitem__ with array=Variable._broadcast_indexes """ - super(DaskIndexingAdapter, self).__init__('broadcast') self.array = array def __getitem__(self, key): @@ -603,12 +603,11 @@ def __setitem__(self, key, value): 'method or accessing its .values attribute.') -class PandasIndexAdapter(IndexableArrayAdapter, utils.NDArrayMixin): +class PandasIndexAdapter(utils.NDArrayMixin): """Wrap a pandas.Index to be better about preserving dtypes and to handle indexing by length 1 tuples like numpy """ def __init__(self, array, dtype=None): - super(PandasIndexAdapter, self).__init__('broadcast') self.array = utils.safe_cast_to_index(array) if dtype is None: if isinstance(array, pd.PeriodIndex): From 1712060e2636d3a737d972de67c498fa0c8e59a8 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 21:36:31 +0900 Subject: [PATCH 030/113] Care for boolean array. 
--- xarray/core/indexing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index be57bd6cd83..60ef4129f1c 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -390,7 +390,11 @@ def vectorize(self, shape): if len([k for k in self if not isinstance(k, slice)]) <= 1: # if there is only one vector and all others are slice, # it can be safely converted to vectorized indexer - return VectorizedIndexer(self) + # Boolean index should be converted to integer array. + return VectorizedIndexer( + [k.nonzero()[0] + if (isinstance(k, np.ndarray) and k.dtype.kind == 'b') + else k for k in self]) else: n_dim = len([k for k in self if not isinstance(k, integer_types)]) i_dim = 0 From 884423a79bdad599a13f7e2314099453ef154208 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 21:54:53 +0900 Subject: [PATCH 031/113] Always map boolean index to integer array. --- xarray/core/indexing.py | 5 +---- xarray/core/variable.py | 7 ++++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 60ef4129f1c..a859c806d01 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -391,10 +391,7 @@ def vectorize(self, shape): # if there is only one vector and all others are slice, # it can be safely converted to vectorized indexer # Boolean index should be converted to integer array. 
- return VectorizedIndexer( - [k.nonzero()[0] - if (isinstance(k, np.ndarray) and k.dtype.kind == 'b') - else k for k in self]) + return VectorizedIndexer(self) else: n_dim = len([k for k in self if not isinstance(k, integer_types)]) i_dim = 0 diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0ea09fe766a..ced4f7d1ba5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -440,15 +440,16 @@ def _broadcast_indexes_outer(self, key): indexer = [] for k in key: if isinstance(k, Variable): - indexer.append(k.data) - elif isinstance(k, integer_types + (slice,)): + k = k.data + + if isinstance(k, integer_types + (slice,)): indexer.append(k) else: k = np.asarray(k) if k.ndim > 1: raise IndexError("Unlabelled multi-dimensional array " "cannot be used for indexing.") - indexer.append(k) + indexer.append(k if k.dtype.kind != 'b' else k.nonzero()[0]) return dims, OuterIndexer(indexer) def nonzero(self): From c2e6f422feb0d171e05c93c878e33a5ed3e0b856 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 22:21:10 +0900 Subject: [PATCH 032/113] Takes care of boolean index in test_indexing --- xarray/tests/test_indexing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 22f2f40fa57..45e24b8e515 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -372,6 +372,10 @@ class TestIndexerTuple(TestCase): v._broadcast_indexes_advanced """ def test_outer_indexer(self): + def nonzero(x): + if isinstance(x, np.ndarray) and x.dtype.kind == 'b': + x = x.nonzero()[0] + return x original = np.random.rand(10, 20, 30) v = Variable(['i', 'j', 'k'], original) I = ReturnItem() @@ -381,7 +385,8 @@ def test_outer_indexer(self): for i in indexers: for j in indexers: for k in indexers: - outer_index = indexing.OuterIndexer((i, j, k)) + outer_index = indexing.OuterIndexer( + (nonzero(i), nonzero(j), nonzero(k))) _, expected = 
v._broadcast_indexes_advanced((i, j, k)) actual = outer_index.vectorize(v.shape) self.assertArrayEqual(v.data[actual], v.data[expected]) From 002eafaaa533613f5587cbb7de98d5af6f9c7599 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 25 Jul 2017 22:40:23 +0900 Subject: [PATCH 033/113] replace self.assertTrue by assert --- xarray/tests/test_indexing.py | 13 ++++----- xarray/tests/test_variable.py | 54 +++++++++++++++++------------------ 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 45e24b8e515..104ff155610 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -83,7 +83,7 @@ def maybe_boolean_array(array, size): expected = original[i][:, j][:, :, k] self.assertArrayEqual(actual, expected) # indivisual testing - self.assertTrue(orthogonal[np.array([0]), :, :].shape == (1, 20, 30)) + assert orthogonal[np.array([0]), :, :].shape == (1, 20, 30) self.assertArrayEqual(orthogonal[[0], :, :], original[[0], :, :]) @@ -299,8 +299,8 @@ def test_lazily_indexed_array(self): v_lazy[:, :, k][:, j][i]]: self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) - self.assertTrue(isinstance( - actual._data, indexing.LazilyIndexedArray)) + assert isinstance(actual._data, + indexing.LazilyIndexedArray) # test sequentially applied indexers indexers = [(3, 2), (I[:], 0), (I[:2], -1), (I[:4], [0]), ([4, 5], 0), ([0, 1, 2], [0, 1]), ([0, 3, 5], I[:2])] @@ -309,10 +309,9 @@ def test_lazily_indexed_array(self): actual = v_lazy[i][j] self.assertEqual(expected.shape, actual.shape) self.assertArrayEqual(expected, actual) - self.assertTrue(isinstance( - actual._data, indexing.LazilyIndexedArray)) - self.assertTrue(isinstance(actual._data.array, - NumpyOrthogonalIndexingAdapter)) + assert isinstance(actual._data, indexing.LazilyIndexedArray) + assert isinstance(actual._data.array, + NumpyOrthogonalIndexingAdapter) class TestCopyOnWriteArray(TestCase): 
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index bec5efe3c12..c0458aeb1f1 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -685,38 +685,38 @@ def test_detect_indexer_type(self): v = Variable(['x', 'y'], data) _, ind = v._broadcast_indexes((0, 1)) - self.assertTrue(type(ind) == indexing.BasicIndexer) + assert type(ind) == indexing.BasicIndexer _, ind = v._broadcast_indexes((0, slice(0, 8, 2))) - self.assertTrue(type(ind) == indexing.BasicIndexer) + assert type(ind) == indexing.BasicIndexer _, ind = v._broadcast_indexes((0, [0, 1])) - self.assertTrue(type(ind) == indexing.OuterIndexer) + assert type(ind) == indexing.OuterIndexer _, ind = v._broadcast_indexes(([0, 1], 1)) - self.assertTrue(type(ind) == indexing.OuterIndexer) + assert type(ind) == indexing.OuterIndexer _, ind = v._broadcast_indexes(([0, 1], [1, 2])) - self.assertTrue(type(ind) == indexing.OuterIndexer) + assert type(ind) == indexing.OuterIndexer _, ind = v._broadcast_indexes(([0, 1], slice(0, 8, 2))) - self.assertTrue(type(ind) == indexing.OuterIndexer) + assert type(ind) == indexing.OuterIndexer vind = Variable(('a', ), [0, 1]) _, ind = v._broadcast_indexes((vind, slice(0, 8, 2))) - self.assertTrue(type(ind) == indexing.OuterIndexer) + assert type(ind) == indexing.OuterIndexer vind = Variable(('y', ), [0, 1]) _, ind = v._broadcast_indexes((vind, 3)) - self.assertTrue(type(ind) == indexing.OuterIndexer) + assert type(ind) == indexing.OuterIndexer vind = Variable(('a', ), [0, 1]) _, ind = v._broadcast_indexes((vind, vind)) - self.assertTrue(type(ind) == indexing.VectorizedIndexer) + assert type(ind) == indexing.VectorizedIndexer vind = Variable(('a', 'b'), [[0, 2], [1, 3]]) _, ind = v._broadcast_indexes((vind, 3)) - self.assertTrue(type(ind) == indexing.VectorizedIndexer) + assert type(ind) == indexing.VectorizedIndexer def test_items(self): data = np.random.random((10, 11)) @@ -754,24 +754,24 @@ def test_getitem_basic(self): v = 
self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) v_new = v[dict(x=0)] - self.assertTrue(v_new.dims == ('y', )) + assert v_new.dims == ('y', ) self.assertArrayEqual(v_new, v._data[0]) v_new = v[dict(x=0, y=slice(None))] - self.assertTrue(v_new.dims == ('y', )) + assert v_new.dims == ('y', ) self.assertArrayEqual(v_new, v._data[0]) v_new = v[dict(x=0, y=1)] - self.assertTrue(v_new.dims == ()) + assert v_new.dims == () self.assertArrayEqual(v_new, v._data[0, 1]) v_new = v[dict(y=1)] - self.assertTrue(v_new.dims == ('x', )) + assert v_new.dims == ('x', ) self.assertArrayEqual(v_new, v._data[:, 1]) # tuple argument v_new = v[(slice(None), 1)] - self.assertTrue(v_new.dims == ('x', )) + assert v_new.dims == ('x', ) self.assertArrayEqual(v_new, v._data[:, 1]) def test_getitem_advanced(self): @@ -779,27 +779,27 @@ def test_getitem_advanced(self): # orthogonal indexing v_new = v[([0, 1], [1, 0])] - self.assertTrue(v_new.dims == ('x', 'y')) + assert v_new.dims == ('x', 'y') self.assertArrayEqual(v_new, v._data[[0, 1]][:, [1, 0]]) v_new = v[[0, 1]] - self.assertTrue(v_new.dims == ('x', 'y')) + assert v_new.dims == ('x', 'y') self.assertArrayEqual(v_new, v._data[[0, 1]]) # with mixed arguments ind = Variable(['a'], [0, 1]) v_new = v[dict(x=[0, 1], y=ind)] - self.assertTrue(v_new.dims == ('x', 'a')) + assert v_new.dims == ('x', 'a') self.assertArrayEqual(v_new, v.load()._data[[0, 1]][:, [0, 1]]) # boolean indexing v_new = v[dict(x=[True, False], y=[False, True])] - self.assertTrue(v_new.dims == ('x', 'y')) + assert v_new.dims == ('x', 'y') self.assertArrayEqual(v_new, v.load()._data[0][1]) ind = Variable(['a'], [True, False]) v_new = v[dict(y=ind)] - self.assertTrue(v_new.dims == ('x', 'a')) + assert v_new.dims == ('x', 'a') self.assertArrayEqual(v_new, v.load()._data[:, 0:1]) def test_getitem_fancy(self): @@ -808,40 +808,40 @@ def test_getitem_fancy(self): ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) v_new = v[ind] - self.assertTrue(v_new.dims == ('a', 'b', 'y')) + assert 
v_new.dims == ('a', 'b', 'y') self.assertArrayEqual(v_new, v.load()._data[([0, 1, 1], [1, 1, 0]), :]) ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) v_new = v[dict(y=ind)] - self.assertTrue(v_new.dims == ('x', 'a', 'b')) + assert v_new.dims == ('x', 'a', 'b') self.assertArrayEqual(v_new, v.load()._data[:, ([0, 1, 2], [2, 1, 0])]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=[1, 0], y=ind)] - self.assertTrue(v_new.dims == ('x', 'a', 'b')) + assert v_new.dims == ('x', 'a', 'b') self.assertArrayEqual(v_new, v.load()._data[[1, 0]][:, ind]) # with integer ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=0, y=ind)] - self.assertTrue(v_new.dims == ('a', 'b')) + assert v_new.dims == ('a', 'b') self.assertArrayEqual(v_new[0], v.load()._data[0][[0, 0]]) self.assertArrayEqual(v_new[1], v.load()._data[0][[1, 1]]) # with slice ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=slice(None), y=ind)] - self.assertTrue(v_new.dims == ('x', 'a', 'b')) + assert v_new.dims == ('x', 'a', 'b') self.assertArrayEqual(v_new, v.load()._data[:, [[0, 0], [1, 1]]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=ind, y=slice(None))] - self.assertTrue(v_new.dims == ('a', 'b', 'y')) + assert v_new.dims == ('a', 'b', 'y') self.assertArrayEqual(v_new, v.load()._data[[[0, 0], [1, 1]], :]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=ind, y=slice(None, 1))] - self.assertTrue(v_new.dims == ('a', 'b', 'y')) + assert v_new.dims == ('a', 'b', 'y') self.assertArrayEqual(v_new, v.load()._data[[[0, 0], [1, 1]], slice(None, 1)]) From eedfb3f04332d1cf3f79b737889f627a0ffb7fdc Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 29 Jul 2017 16:41:34 +0900 Subject: [PATCH 034/113] Fix based on shoyer's comments. 
--- xarray/backends/netCDF4_.py | 3 ++ xarray/core/indexing.py | 83 +++++++++-------------------------- xarray/core/variable.py | 43 ++++++++++-------- xarray/tests/test_indexing.py | 24 ---------- 4 files changed, 49 insertions(+), 104 deletions(-) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 93af50f4ae5..cac88108cc8 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -50,6 +50,9 @@ def get_array(self): class NetCDF4ArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): + # Make sure it is not an OuterIndexer + key = tuple(key) + if self.datastore.is_remote: # pragma: no cover getitem = functools.partial(robust_getitem, catch=RuntimeError) else: diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index a859c806d01..00e66aaf167 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -73,41 +73,6 @@ def _expand_slice(slice_, size): return np.arange(*slice_.indices(size)) -def maybe_convert_to_slice(indexer, size): - """Convert an indexer into an equivalent slice object, if possible. - - Arguments - --------- - indexer : int, slice or np.ndarray - If a numpy array, must have integer dtype. - size : integer - Integer size of the dimension to be indexed. 
- """ - if indexer.ndim != 1 or not isinstance(indexer, np.ndarray): - return indexer - - if indexer.size == 0: - return slice(0, 0) - - if indexer.min() < -size or indexer.max() >= size: - raise IndexError( - 'indexer has elements out of bounds for axis of size {}: {}' - .format(size, indexer)) - - indexer = np.where(indexer < 0, indexer + size, indexer) - if indexer.size == 1: - i = int(indexer[0]) - return slice(i, i + 1) - - start = int(indexer[0]) - step = int(indexer[1] - start) - stop = start + step * indexer.size - guess = slice(start, stop, step) - if np.array_equal(_expand_slice(guess, size), indexer): - return guess - return indexer - - # TODO should be deprecated def orthogonal_indexer(key, shape): """Given a key for orthogonal array indexing, returns an equivalent key @@ -403,7 +368,7 @@ def vectorize(self, shape): if isinstance(k, slice): k = np.arange(*k.indices(size)) if k.dtype.kind == 'b': - k = k.nonzero()[0] + (k, ) = k.nonzero() shape = [(1,) * i_dim + (k.size, ) + (1,) * (n_dim - i_dim - 1)] new_key.append(k.reshape(*shape)) @@ -438,21 +403,23 @@ def __init__(self, array, key=None): else: if key is None: key = (slice(None),) * array.ndim - if len(key) > 0: - key = OuterIndexer(key) + key = OuterIndexer(key) self.array = array self.key = key def _updated_key(self, new_key): # TODO should suport VectorizedIndexer - new_key = iter(new_key) + if isinstance(new_key, VectorizedIndexer): + raise NotImplementedError('Vectorized indexing for {} is not ' + 'implemented.'.format(type(self))) + new_key = iter(expanded_indexer(new_key, self.ndim)) key = [] for size, k in zip(self.array.shape, self.key): if isinstance(k, integer_types): key.append(k) else: key.append(_index_indexer_1d(k, next(new_key), size)) - return () if len(key) == 0 else OuterIndexer(key) + return OuterIndexer(key) @property def shape(self): @@ -465,21 +432,13 @@ def shape(self): return tuple(shape) def __array__(self, dtype=None): - array = broadcasted_indexable(self.array) + array = 
xarray_indexable(self.array) return np.asarray(array[self.key], dtype=None) def __getitem__(self, key): - if isinstance(key, VectorizedIndexer): - raise NotImplementedError('Vectorized indexing for {} is not ' - 'implemented.'.format(type(self))) - key = expanded_indexer(key, self.ndim) return type(self)(self.array, self._updated_key(key)) def __setitem__(self, key, value): - if isinstance(key, VectorizedIndexer): - raise NotImplementedError('Vectorized indexing for {} is not ' - 'implemented.'.format(type(self))) - key = expanded_indexer(key, self.ndim) key = self._updated_key(key) self.array[key] = value @@ -536,7 +495,7 @@ def __setitem__(self, key, value): self.array[key] = value -def broadcasted_indexable(array): +def xarray_indexable(array): if isinstance(array, np.ndarray): return NumpyIndexingAdapter(array) if isinstance(array, pd.Index): @@ -573,7 +532,7 @@ def __setitem__(self, key, value): class DaskIndexingAdapter(utils.NDArrayMixin): - """Wrap a dask array to support broadcasted-indexing. + """Wrap a dask array to support xarray-style indexing. """ def __init__(self, array): """ This adapter is usually called in Variable.__getitem__ with @@ -582,19 +541,19 @@ def __init__(self, array): self.array = array def __getitem__(self, key): - if not isinstance(key, VectorizedIndexer): - try: - return self.array[key] - except NotImplementedError: - # manual orthogonal indexing. - value = self.array - for axis, subkey in reversed(list(enumerate(key))): - value = value[(slice(None),) * axis + (subkey,)] - return value - else: + if isinstance(key, VectorizedIndexer): # TODO should support vindex raise IndexError( - 'dask does not support vectorized indexing : {}'.format(key)) + 'dask does not support vectorized indexing : {}'.format(key)) + + try: + return self.array[key] + except NotImplementedError: + # manual orthogonal indexing. 
+ value = self.array + for axis, subkey in reversed(list(enumerate(key))): + value = value[(slice(None),) * axis + (subkey,)] + return value def __setitem__(self, key, value): raise TypeError("this variable's data is stored in a dask array, " diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ced4f7d1ba5..939f714777b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -18,7 +18,7 @@ from . import utils from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) -from .indexing import (PandasIndexAdapter, broadcasted_indexable, BasicIndexer, +from .indexing import (PandasIndexAdapter, xarray_indexable, BasicIndexer, OuterIndexer, VectorizedIndexer) import xarray as xr # only for Dataset and DataArray @@ -29,6 +29,9 @@ pass +basic_indexing_types = integer_types + (slice,) + + class MissingDimensionsError(ValueError): """Error class used when we can't safely guess a dimension name. """ @@ -305,7 +308,7 @@ def data(self, data): @property def _indexable_data(self): - return broadcasted_indexable(self._data) + return xarray_indexable(self._data) def load(self): """Manually trigger loading of this variable's data from disk or a @@ -402,8 +405,7 @@ def _broadcast_indexes(self, key): key = self._item_key_to_tuple(key) # key is a tuple # key is a tuple of full size key = indexing.expanded_indexer(key, self.ndim) - basic_indexing_types = integer_types + (slice,) - if all([isinstance(k, basic_indexing_types) for k in key]): + if all(isinstance(k, basic_indexing_types) for k in key): return self._broadcast_indexes_basic(key) # Detect it can be mapped as an outer indexer @@ -442,23 +444,24 @@ def _broadcast_indexes_outer(self, key): if isinstance(k, Variable): k = k.data - if isinstance(k, integer_types + (slice,)): + if isinstance(k, basic_indexing_types): indexer.append(k) else: k = np.asarray(k) if k.ndim > 1: raise IndexError("Unlabelled multi-dimensional array " - "cannot be used for indexing.") - indexer.append(k 
if k.dtype.kind != 'b' else k.nonzero()[0]) + "cannot be used for indexing: {}".format( + k)) + indexer.append(k if k.dtype.kind != 'b' else np.flatnonzero(k)) return dims, OuterIndexer(indexer) - def nonzero(self): + def _nonzero(self): """ Equivalent numpy's nonzero but returns a tuple of Varibles. """ # TODO we should replace dask's native nonzero # after https://github.com/dask/dask/issues/1076 is implemented. nonzeros = np.nonzero(self.data) - return tuple([as_variable(nz, name=dim) for nz, dim - in zip(nonzeros, self.dims)]) + return tuple(Variable((dim), nz) for nz, dim + in zip(nonzeros, self.dims)) def _broadcast_indexes_advanced(self, key): variables = [] @@ -477,25 +480,29 @@ def _broadcast_indexes_advanced(self, key): if variable.ndim > 1: raise IndexError("{}-dimensional boolean indexing is " "not supported. ".format(variable.ndim)) - variables.extend(list(variable.nonzero())) + variables.extend(variable._nonzero()) else: variables.append(variable) variables = _broadcast_compat_variables(*variables) dims = variables[0].dims # all variables have the same dims # overwrite if there is integers - key = VectorizedIndexer(k if isinstance(k, integer_types) - else variable.data - for variable, k in zip(variables, key)) + key = VectorizedIndexer(variable.data for variable, k + in zip(variables, key)) return dims, key def __getitem__(self, key): """Return a new Array object whose contents are consistent with getting the provided key from the underlying data. - NB. __getitem__ and __setitem__ implement "diagonal indexing" like - np.ndarray. + # TODO more docstrings. + NB. __getitem__ and __setitem__ implement xarray-style indexing, + where if keys are unlabelled arrays, we index the array orthogonally + with them. If keys are labelled array (such as Variables), they are + broadcasted with our usual scheme and then the array is indexed with + the broadcasted key, like numpy's fancy indexing. - This method will replace __getitem__ after we make sure its stability. 
+ If you really want to do indexing like `x[x > 0]`, manipulate the numpy + array `x.values` directly. """ dims, index_tuple = self._broadcast_indexes(key) values = self._indexable_data[index_tuple] @@ -513,7 +520,7 @@ def __setitem__(self, key, value): See __getitem__ for more details. """ dims, index_tuple = self._broadcast_indexes(key) - data = broadcasted_indexable(self._data) + data = xarray_indexable(self._data) if isinstance(value, Variable): data[index_tuple] = value.set_dims(dims) else: diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 104ff155610..8913748c950 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -107,30 +107,6 @@ def test_expanded_indexer(self): with self.assertRaisesRegexp(IndexError, 'too many indices'): indexing.expanded_indexer(I[1, 2, 3], 2) - def test_maybe_convert_to_slice(self): - - cases = [ - (1,), - (1, 1), - (1, 2), - (10,), - (0, 10), - (5, 10), - (5, 8), - (None, 5), - (None, -3), - (0, 10, 2), - (10, None, -1), - (7, 3, -2), - ] - for case in cases: - slice_obj = slice(*case) - base_array = np.arange(*slice_obj.indices(10)) - for array in [base_array, base_array - 10]: - actual = indexing.maybe_convert_to_slice(array, 10) - self.assertArrayEqual(np.arange(10)[actual], - np.arange(10)[slice_obj]) - def test_orthogonal_indexer(self): x = np.random.randn(10, 11, 12, 13, 14) y = np.arange(5) From bb2e5153f23f4d5349092e11c61a585a8eab5066 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 29 Jul 2017 16:56:19 +0900 Subject: [PATCH 035/113] Added `to_tuple()` method to IndexerTuple class. 
--- xarray/backends/h5netcdf_.py | 1 + xarray/backends/netCDF4_.py | 3 +-- xarray/backends/pydap_.py | 1 + xarray/backends/pynio_.py | 1 + xarray/backends/rasterio_.py | 1 + xarray/backends/scipy_.py | 2 ++ xarray/core/indexing.py | 7 +++++++ 7 files changed, 14 insertions(+), 2 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index b1b595447d6..752f65fd6b7 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -16,6 +16,7 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): + key = key.to_tuple() if hasattr(key, 'to_tuple') else key with self.datastore.ensure_open(autoclose=True): return self.get_array()[key] diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index cac88108cc8..7594aa09f4c 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -50,8 +50,7 @@ def get_array(self): class NetCDF4ArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - # Make sure it is not an OuterIndexer - key = tuple(key) + key = key.to_tuple() if hasattr(key, 'to_tuple') else key if self.datastore.is_remote: # pragma: no cover getitem = functools.partial(robust_getitem, catch=RuntimeError) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index a4ccbce5bc1..4f2ef37a1d8 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -27,6 +27,7 @@ def dtype(self): return np.dtype(t.typecode + str(t.size)) def __getitem__(self, key): + key = key.to_tuple() if hasattr(key, 'to_tuple') else key if not isinstance(key, tuple): key = (key,) for k in key: diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 449971a9145..57ae7cd33bf 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -28,6 +28,7 @@ def get_array(self): return self.datastore.ds.variables[self.variable_name] def __getitem__(self, key): + key = key.to_tuple() if hasattr(key, 'to_tuple') else key with 
self.datastore.ensure_open(autoclose=True): array = self.get_array() if key == () and self.ndim == 0: diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 16f5a55fa69..d02da05fc1a 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -38,6 +38,7 @@ def shape(self): return self._shape def __getitem__(self, key): + key = key.to_tuple() if hasattr(key, 'to_tuple') else key # make our job a bit easier key = indexing.canonicalize_indexer(key, self._ndims) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 1805cb28635..55a7248a903 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -48,6 +48,8 @@ def get_array(self): def __getitem__(self, key): if isinstance(key, OuterIndexer): key = key.vectorize(self.shape) + + key = key.to_tuple() if hasattr(key, 'to_tuple') else key with self.datastore.ensure_open(autoclose=True): data = NumpyIndexingAdapter(self.get_array())[key] # Copy data if the source file is mmapped. 
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 00e66aaf167..d4f4945b6a1 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -336,6 +336,10 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): class IndexerTuple(tuple): """ Base class for xarray indexing tuples """ + def to_tuple(self): + """ Converts to a native python tuple """ + return tuple(self) + def __repr__(self): return type(self).__name__ + super(IndexerTuple, self).__repr__() @@ -523,11 +527,13 @@ def _ensure_ndarray(self, value): def __getitem__(self, key): if isinstance(key, OuterIndexer): key = key.vectorize(self.shape) + key = key.to_tuple() if hasattr(key, 'to_tuple') else key return self._ensure_ndarray(self.array[key]) def __setitem__(self, key, value): if isinstance(key, OuterIndexer): key = key.vectorize(self.shape) + key = key.to_tuple() if hasattr(key, 'to_tuple') else key self.array[key] = value @@ -547,6 +553,7 @@ def __getitem__(self, key): 'dask does not support vectorized indexing : {}'.format(key)) try: + key = key.to_tuple() if hasattr(key, 'to_tuple') else key return self.array[key] except NotImplementedError: # manual orthogonal indexing. 
From 5983a67d6e3c9c6088e62a0e5c3180f5444610c6 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 29 Jul 2017 17:13:08 +0900 Subject: [PATCH 036/113] Removed: 'orthogonal_indexer', 'canonicalize_indexer' --- xarray/backends/rasterio_.py | 3 -- xarray/core/indexing.py | 74 +------------------------------ xarray/tests/test_indexing.py | 83 ++++++++++++++++++++++++++++++++--- 3 files changed, 79 insertions(+), 81 deletions(-) diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index d02da05fc1a..0b952f6b15c 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -40,9 +40,6 @@ def shape(self): def __getitem__(self, key): key = key.to_tuple() if hasattr(key, 'to_tuple') else key - # make our job a bit easier - key = indexing.canonicalize_indexer(key, self._ndims) - # bands cannot be windowed but they can be listed band_key = key[0] n_bands = self.shape[0] diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d4f4945b6a1..e017015e9a0 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -9,7 +9,7 @@ from . import utils from .pycompat import (iteritems, range, integer_types, dask_array_type, suppress) -from .utils import is_full_slice, is_dict_like +from .utils import is_dict_like def expanded_indexer(key, ndim): @@ -42,82 +42,10 @@ def expanded_indexer(key, ndim): return tuple(new_key) -# TODO deprecate -def canonicalize_indexer(key, ndim): - """Given an indexer for orthogonal array indexing, return an indexer that - is a tuple composed entirely of slices, integer ndarrays and native python - ints. 
- """ - def canonicalize(indexer): - if not isinstance(indexer, slice): - indexer = np.asarray(indexer) - if indexer.ndim == 0: - indexer = int(np.asscalar(indexer)) - else: - if indexer.ndim != 1: - raise ValueError('orthogonal array indexing only supports ' - '1d arrays') - if indexer.dtype.kind == 'b': - indexer, = np.nonzero(indexer) - elif indexer.dtype.kind != 'i': - raise ValueError('invalid subkey %r for integer based ' - 'array indexing; all subkeys must be ' - 'slices, integers or sequences of ' - 'integers or Booleans' % indexer) - return indexer - - return tuple(canonicalize(k) for k in expanded_indexer(key, ndim)) - - def _expand_slice(slice_, size): return np.arange(*slice_.indices(size)) -# TODO should be deprecated -def orthogonal_indexer(key, shape): - """Given a key for orthogonal array indexing, returns an equivalent key - suitable for indexing a numpy.ndarray with fancy indexing. - """ - # replace Ellipsis objects with slices - key = list(canonicalize_indexer(key, len(shape))) - # replace 1d arrays and slices with broadcast compatible arrays - # note: we treat integers separately (instead of turning them into 1d - # arrays) because integers (and only integers) collapse axes when used with - # __getitem__ - non_int_keys = [n for n, k in enumerate(key) - if not isinstance(k, integer_types)] - - def full_slices_unselected(n_list): - def all_full_slices(key_index): - return all(is_full_slice(key[n]) for n in key_index) - if not n_list: - return n_list - elif all_full_slices(range(n_list[0] + 1)): - return full_slices_unselected(n_list[1:]) - elif all_full_slices(range(n_list[-1], len(key))): - return full_slices_unselected(n_list[:-1]) - else: - return n_list - - # However, testing suggests it is OK to keep contiguous sequences of full - # slices at the start or the end of the key. Keeping slices around (when - # possible) instead of converting slices to arrays significantly speeds up - # indexing. 
- # (Honestly, I don't understand when it's not OK to keep slices even in - # between integer indices if as array is somewhere in the key, but such are - # the admittedly mind-boggling ways of numpy's advanced indexing.) - array_keys = full_slices_unselected(non_int_keys) - - def maybe_expand_slice(k, length): - return _expand_slice(k, length) if isinstance(k, slice) else k - - array_indexers = np.ix_(*(maybe_expand_slice(key[n], shape[n]) - for n in array_keys)) - for i, n in enumerate(array_keys): - key[n] = array_indexers[i] - return tuple(key) - - def _try_get_item(x): try: return x.item() diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 8913748c950..a8ada0f8090 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -6,12 +6,85 @@ from xarray import Dataset, DataArray, Variable from xarray.core import indexing, utils +from xarray.core.pycompat import integer_types from . import TestCase, ReturnItem +def canonicalize_indexer(key, ndim): + """Given an indexer for orthogonal array indexing, return an indexer that + is a tuple composed entirely of slices, integer ndarrays and native python + ints. + """ + def canonicalize(indexer): + if not isinstance(indexer, slice): + indexer = np.asarray(indexer) + if indexer.ndim == 0: + indexer = int(np.asscalar(indexer)) + else: + if indexer.ndim != 1: + raise ValueError('orthogonal array indexing only supports ' + '1d arrays') + if indexer.dtype.kind == 'b': + indexer, = np.nonzero(indexer) + elif indexer.dtype.kind != 'i': + raise ValueError('invalid subkey %r for integer based ' + 'array indexing; all subkeys must be ' + 'slices, integers or sequences of ' + 'integers or Booleans' % indexer) + return indexer + + return tuple(canonicalize(k) for k in indexing.expanded_indexer(key, ndim)) + + +def orthogonal_indexer(key, shape): + """Given a key for orthogonal array indexing, returns an equivalent key + suitable for indexing a numpy.ndarray with fancy indexing. 
+ """ + # replace Ellipsis objects with slices + key = list(canonicalize_indexer(key, len(shape))) + # replace 1d arrays and slices with broadcast compatible arrays + # note: we treat integers separately (instead of turning them into 1d + # arrays) because integers (and only integers) collapse axes when used with + # __getitem__ + non_int_keys = [n for n, k in enumerate(key) + if not isinstance(k, integer_types)] + + def full_slices_unselected(n_list): + def all_full_slices(key_index): + return all(utils.is_full_slice(key[n]) for n in key_index) + if not n_list: + return n_list + elif all_full_slices(range(n_list[0] + 1)): + return full_slices_unselected(n_list[1:]) + elif all_full_slices(range(n_list[-1], len(key))): + return full_slices_unselected(n_list[:-1]) + else: + return n_list + + # However, testing suggests it is OK to keep contiguous sequences of full + # slices at the start or the end of the key. Keeping slices around (when + # possible) instead of converting slices to arrays significantly speeds up + # indexing. + # (Honestly, I don't understand when it's not OK to keep slices even in + # between integer indices if as array is somewhere in the key, but such are + # the admittedly mind-boggling ways of numpy's advanced indexing.) + array_keys = full_slices_unselected(non_int_keys) + + def maybe_expand_slice(k, length): + return indexing._expand_slice(k, length) if isinstance(k, slice) else k + + array_indexers = np.ix_(*(maybe_expand_slice(key[n], shape[n]) + for n in array_keys)) + for i, n in enumerate(array_keys): + key[n] = array_indexers[i] + return tuple(key) + + class NumpyOrthogonalIndexingAdapter(utils.NDArrayMixin): """Wrap a NumPy array to use orthogonal indexing (array indexing accesses different dimensions independently, like netCDF4-python variables) + + This class is only for testing. 
""" # note: this object is somewhat similar to biggus.NumpyArrayAdapter in that # it implements orthogonal indexing, except it casts to a numpy array, @@ -27,7 +100,7 @@ def _convert_key(self, key): if any(not isinstance(k, indexing.integer_types + (slice,)) for k in key): # key would trigger fancy indexing - key = indexing.orthogonal_indexer(key, self.shape) + key = orthogonal_indexer(key, self.shape) return key def _ensure_ndarray(self, value): @@ -119,7 +192,7 @@ def test_orthogonal_indexer(self): I[::-2], I[5::-2], I[:3:-2], I[2:5:-1], I[7:3:-2], I[:3, :4], I[:3, 0, :4], I[:3, 0, :4, 0], I[y], I[:, y], I[0, y], I[:2, :3, y], I[0, y, :, :4, 0]]: - j = indexing.orthogonal_indexer(i, x.shape) + j = orthogonal_indexer(i, x.shape) self.assertArrayEqual(x[i], x[j]) self.assertArrayEqual(self.set_to_zero(x, i), self.set_to_zero(x, j)) @@ -137,16 +210,16 @@ def test_orthogonal_indexer(self): (I[0, :, y, :, 0], I[0, :, :5, :, 0], (11, 5, 13)), (I[:, :, y, :, 0], I[:, :, :5, :, 0], (10, 11, 5, 13)), (I[:, :, y, z, :], I[:, :, :5, 2:8:2], (10, 11, 5, 3, 14))]: - k = indexing.orthogonal_indexer(i, x.shape) + k = orthogonal_indexer(i, x.shape) self.assertEqual(shape, x[k].shape) self.assertArrayEqual(x[j], x[k]) self.assertArrayEqual(self.set_to_zero(x, j), self.set_to_zero(x, k)) # standard numpy (non-orthogonal) indexing doesn't work anymore with self.assertRaisesRegexp(ValueError, 'only supports 1d'): - indexing.orthogonal_indexer(x > 0, x.shape) + orthogonal_indexer(x > 0, x.shape) with self.assertRaisesRegexp(ValueError, 'invalid subkey'): - print(indexing.orthogonal_indexer((1.5 * y, 1.5 * y), x.shape)) + print(orthogonal_indexer((1.5 * y, 1.5 * y), x.shape)) def test_asarray_tuplesafe(self): res = indexing._asarray_tuplesafe(('a', 1)) From 7a5ff79164e7a7254532a290b2a11c37e6bcfbe6 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 29 Jul 2017 21:55:26 +0900 Subject: [PATCH 037/113] update IndexVariable.__getitem__ --- xarray/core/indexing.py | 1 + 
xarray/core/variable.py | 15 +++++++++------ xarray/tests/test_variable.py | 26 +++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e017015e9a0..7e6c1d10c90 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -541,6 +541,7 @@ def __getitem__(self, key): # objects don't like tuples) key, = key + key = key.to_tuple() if hasattr(key, 'to_tuple') else key result = self.array[key] if isinstance(result, pd.Index): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 939f714777b..d9954d5b280 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -483,11 +483,13 @@ def _broadcast_indexes_advanced(self, key): variables.extend(variable._nonzero()) else: variables.append(variable) - variables = _broadcast_compat_variables(*variables) + try: + variables = _broadcast_compat_variables(*variables) + except ValueError: + raise IndexError("Dimensions of indexers mismatch: {}".format(key)) dims = variables[0].dims # all variables have the same dims # overwrite if there is integers - key = VectorizedIndexer(variable.data for variable, k - in zip(variables, key)) + key = VectorizedIndexer(variable.data for variable in variables) return dims, key def __getitem__(self, key): @@ -1325,12 +1327,13 @@ def chunk(self, chunks=None, name=None, lock=False): return self.copy(deep=False) def __getitem__(self, key): - key = self._item_key_to_tuple(key) - values = self._indexable_data[key] + dims, index_tuple = self._broadcast_indexes(key) + assert len(dims) <= 1 + values = self._indexable_data[index_tuple] if not hasattr(values, 'ndim') or values.ndim == 0: return Variable((), values, self._attrs, self._encoding) else: - return type(self)(self.dims, values, self._attrs, + return type(self)(dims, values, self._attrs, self._encoding, fastpath=True) def __setitem__(self, key, value): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py 
index c0458aeb1f1..01ef4eb64b7 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -53,6 +53,30 @@ def test_getitem_dict(self): expected = v[0] self.assertVariableIdentical(expected, actual) + def test_getitem_1d(self): + v = self.cls(['x'], [0, 1, 2]) + + v_new = v[dict(x=[0, 1])] + assert v_new.dims == ('x', ) + self.assertArrayEqual(v_new, v._data[[0, 1]]) + + v_new = v[dict(x=slice(None))] + assert v_new.dims == ('x', ) + self.assertArrayEqual(v_new, v._data) + + v_new = v[dict(x=Variable('a', [0, 1]))] + assert v_new.dims == ('a', ) + self.assertArrayEqual(v_new, v._data[[0, 1]]) + + v_new = v[dict(x=1)] + assert v_new.dims == () + self.assertArrayEqual(v_new, v._data[1]) + + # tuple argument + v_new = v[slice(None)] + assert v_new.dims == ('x', ) + self.assertArrayEqual(v_new, v._data) + def _assertIndexedLikeNDArray(self, variable, expected_value0, expected_dtype=None): """Given a 1-dimensional variable, verify that the variable is indexed @@ -851,7 +875,7 @@ def test_getitem_error(self): with self.assertRaisesRegexp(IndexError, "Unlabelled multi-"): v[[[0, 1], [1, 2]]] - with self.assertRaisesRegexp(ValueError, "operands cannot be "): + with self.assertRaisesRegexp(IndexError, "Dimensions of indexers "): ind_x = Variable(['a', 'b'], [[0, 0], [1, 1]]) ind_y = Variable(['a'], [0]) v[(ind_x, ind_y)] From 0b559bcfb1135b392ee2b341b051c0978593e2a0 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 30 Jul 2017 15:28:47 +0900 Subject: [PATCH 038/113] Made to_tuple function. 
--- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/netCDF4_.py | 2 +- xarray/backends/pydap_.py | 2 +- xarray/backends/pynio_.py | 2 +- xarray/backends/rasterio_.py | 2 +- xarray/backends/scipy_.py | 4 ++-- xarray/core/formatting.py | 5 +++-- xarray/core/indexing.py | 16 +++++++++------- 8 files changed, 19 insertions(+), 16 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 752f65fd6b7..3bab68007e1 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -16,7 +16,7 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = indexing.to_tuple(key) with self.datastore.ensure_open(autoclose=True): return self.get_array()[key] diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 7594aa09f4c..cd1462aa46b 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -50,7 +50,7 @@ def get_array(self): class NetCDF4ArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = indexing.to_tuple(key) if self.datastore.is_remote: # pragma: no cover getitem = functools.partial(robust_getitem, catch=RuntimeError) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4f2ef37a1d8..c27bf4dac07 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -27,7 +27,7 @@ def dtype(self): return np.dtype(t.typecode + str(t.size)) def __getitem__(self, key): - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = indexing.to_tuple(key) if not isinstance(key, tuple): key = (key,) for k in key: diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 57ae7cd33bf..87083bf441a 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -28,7 +28,7 @@ def get_array(self): return self.datastore.ds.variables[self.variable_name] def __getitem__(self, key): - key 
= key.to_tuple() if hasattr(key, 'to_tuple') else key + key = indexing.to_tuple(key) with self.datastore.ensure_open(autoclose=True): array = self.get_array() if key == () and self.ndim == 0: diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 0b952f6b15c..418e0dbba95 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -38,7 +38,7 @@ def shape(self): return self._shape def __getitem__(self, key): - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = indexing.to_tuple(key) # bands cannot be windowed but they can be listed band_key = key[0] diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 55a7248a903..1e875d0f858 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -11,7 +11,7 @@ from ..core.pycompat import iteritems, OrderedDict, basestring from ..core.utils import (Frozen, FrozenOrderedDict, NdimSizeLenMixin, DunderArrayMixin) -from ..core.indexing import NumpyIndexingAdapter, OuterIndexer +from ..core.indexing import NumpyIndexingAdapter, OuterIndexer, to_tuple from .common import WritableCFDataStore, DataStorePickleMixin from .netcdf3 import (is_valid_nc3_name, encode_nc3_attr_value, @@ -49,7 +49,7 @@ def __getitem__(self, key): if isinstance(key, OuterIndexer): key = key.vectorize(self.shape) - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = to_tuple(key) with self.datastore.ensure_open(autoclose=True): data = NumpyIndexingAdapter(self.get_array())[key] # Copy data if the source file is mmapped. 
diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 0996ef91cd9..9b21691a5b6 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -21,6 +21,7 @@ from .options import OPTIONS from .pycompat import PY2, unicode_type, bytes_type, dask_array_type +from .indexing import IndexerTuple def pretty_print(x, numchars): @@ -68,8 +69,8 @@ def _get_indexer_at_least_n_items(shape, n_desired): cum_items = np.cumprod(shape[::-1]) n_steps = np.argmax(cum_items >= n_desired) stop = int(np.ceil(float(n_desired) / np.r_[1, cum_items][n_steps])) - indexer = ((0, ) * (len(shape) - 1 - n_steps) + (slice(stop), ) + - (slice(None), ) * n_steps) + indexer = IndexerTuple((0, ) * (len(shape) - 1 - n_steps) + (slice(stop), ) + + (slice(None), ) * n_steps) return indexer diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 7e6c1d10c90..8c8e452d6b9 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -264,14 +264,16 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): class IndexerTuple(tuple): """ Base class for xarray indexing tuples """ - def to_tuple(self): - """ Converts to a native python tuple """ - return tuple(self) def __repr__(self): return type(self).__name__ + super(IndexerTuple, self).__repr__() +def to_tuple(key): + """ Converts our indexer tuple to a native python tuple """ + return tuple(key) if isinstance(key, IndexerTuple) else key + + class BasicIndexer(IndexerTuple): """ Tuple for basic indexing. 
""" @@ -455,13 +457,13 @@ def _ensure_ndarray(self, value): def __getitem__(self, key): if isinstance(key, OuterIndexer): key = key.vectorize(self.shape) - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = to_tuple(key) return self._ensure_ndarray(self.array[key]) def __setitem__(self, key, value): if isinstance(key, OuterIndexer): key = key.vectorize(self.shape) - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = to_tuple(key) self.array[key] = value @@ -481,7 +483,7 @@ def __getitem__(self, key): 'dask does not support vectorized indexing : {}'.format(key)) try: - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = to_tuple(key) return self.array[key] except NotImplementedError: # manual orthogonal indexing. @@ -541,7 +543,7 @@ def __getitem__(self, key): # objects don't like tuples) key, = key - key = key.to_tuple() if hasattr(key, 'to_tuple') else key + key = to_tuple(key) result = self.array[key] if isinstance(result, pd.Index): From bad828ea2124b1706e3d722d08a578afd3d05bef Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 30 Jul 2017 15:40:28 +0900 Subject: [PATCH 039/113] BASIC_INDEXING_TYPES --- xarray/core/variable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d9954d5b280..3cfac6be1ce 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -29,7 +29,7 @@ pass -basic_indexing_types = integer_types + (slice,) +BASIC_INDEXING_TYPES = integer_types + (slice,) class MissingDimensionsError(ValueError): @@ -405,7 +405,7 @@ def _broadcast_indexes(self, key): key = self._item_key_to_tuple(key) # key is a tuple # key is a tuple of full size key = indexing.expanded_indexer(key, self.ndim) - if all(isinstance(k, basic_indexing_types) for k in key): + if all(isinstance(k, BASIC_INDEXING_TYPES) for k in key): return self._broadcast_indexes_basic(key) # Detect it can be mapped as an outer indexer @@ -444,7 +444,7 @@ def 
_broadcast_indexes_outer(self, key): if isinstance(k, Variable): k = k.data - if isinstance(k, basic_indexing_types): + if isinstance(k, BASIC_INDEXING_TYPES): indexer.append(k) else: k = np.asarray(k) From a821a2bb2745496ff72e2619ac7b1a8d73cbaea8 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 30 Jul 2017 15:42:55 +0900 Subject: [PATCH 040/113] Removed unused function from tests. --- xarray/tests/test_indexing.py | 198 +--------------------------------- 1 file changed, 3 insertions(+), 195 deletions(-) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index a8ada0f8090..249b745e6e9 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -5,161 +5,10 @@ import pandas as pd from xarray import Dataset, DataArray, Variable -from xarray.core import indexing, utils -from xarray.core.pycompat import integer_types +from xarray.core import indexing from . import TestCase, ReturnItem -def canonicalize_indexer(key, ndim): - """Given an indexer for orthogonal array indexing, return an indexer that - is a tuple composed entirely of slices, integer ndarrays and native python - ints. - """ - def canonicalize(indexer): - if not isinstance(indexer, slice): - indexer = np.asarray(indexer) - if indexer.ndim == 0: - indexer = int(np.asscalar(indexer)) - else: - if indexer.ndim != 1: - raise ValueError('orthogonal array indexing only supports ' - '1d arrays') - if indexer.dtype.kind == 'b': - indexer, = np.nonzero(indexer) - elif indexer.dtype.kind != 'i': - raise ValueError('invalid subkey %r for integer based ' - 'array indexing; all subkeys must be ' - 'slices, integers or sequences of ' - 'integers or Booleans' % indexer) - return indexer - - return tuple(canonicalize(k) for k in indexing.expanded_indexer(key, ndim)) - - -def orthogonal_indexer(key, shape): - """Given a key for orthogonal array indexing, returns an equivalent key - suitable for indexing a numpy.ndarray with fancy indexing. 
- """ - # replace Ellipsis objects with slices - key = list(canonicalize_indexer(key, len(shape))) - # replace 1d arrays and slices with broadcast compatible arrays - # note: we treat integers separately (instead of turning them into 1d - # arrays) because integers (and only integers) collapse axes when used with - # __getitem__ - non_int_keys = [n for n, k in enumerate(key) - if not isinstance(k, integer_types)] - - def full_slices_unselected(n_list): - def all_full_slices(key_index): - return all(utils.is_full_slice(key[n]) for n in key_index) - if not n_list: - return n_list - elif all_full_slices(range(n_list[0] + 1)): - return full_slices_unselected(n_list[1:]) - elif all_full_slices(range(n_list[-1], len(key))): - return full_slices_unselected(n_list[:-1]) - else: - return n_list - - # However, testing suggests it is OK to keep contiguous sequences of full - # slices at the start or the end of the key. Keeping slices around (when - # possible) instead of converting slices to arrays significantly speeds up - # indexing. - # (Honestly, I don't understand when it's not OK to keep slices even in - # between integer indices if as array is somewhere in the key, but such are - # the admittedly mind-boggling ways of numpy's advanced indexing.) - array_keys = full_slices_unselected(non_int_keys) - - def maybe_expand_slice(k, length): - return indexing._expand_slice(k, length) if isinstance(k, slice) else k - - array_indexers = np.ix_(*(maybe_expand_slice(key[n], shape[n]) - for n in array_keys)) - for i, n in enumerate(array_keys): - key[n] = array_indexers[i] - return tuple(key) - - -class NumpyOrthogonalIndexingAdapter(utils.NDArrayMixin): - """Wrap a NumPy array to use orthogonal indexing (array indexing - accesses different dimensions independently, like netCDF4-python variables) - - This class is only for testing. 
- """ - # note: this object is somewhat similar to biggus.NumpyArrayAdapter in that - # it implements orthogonal indexing, except it casts to a numpy array, - # isn't lazy and supports writing values. - def __init__(self, array): - self.array = np.asarray(array) - - def __array__(self, dtype=None): - return np.asarray(self.array, dtype=dtype) - - def _convert_key(self, key): - key = indexing.expanded_indexer(key, self.ndim) - if any(not isinstance(k, indexing.integer_types + (slice,)) - for k in key): - # key would trigger fancy indexing - key = orthogonal_indexer(key, self.shape) - return key - - def _ensure_ndarray(self, value): - # We always want the result of indexing to be a NumPy array. If it's - # not, then it really should be a 0d array. Doing the coercion here - # instead of inside variable.as_compatible_data makes it less error - # prone. - if not isinstance(value, np.ndarray): - value = utils.to_0d_array(value) - return value - - def __getitem__(self, key): - key = self._convert_key(key) - return type(self)(self._ensure_ndarray(self.array[key])) - - def __setitem__(self, key, value): - key = self._convert_key(key) - self.array[key] = value - - -class TestNumpyOrthogonalIndexingAdapter(TestCase): - def test_basic(self): - def maybe_boolean_array(array, size): - """ Map boolean array to size 'size' by appendin False in its tail - """ - if hasattr(array, 'dtype') and array.dtype.kind == 'b': - array_new = np.ndarray(size, dtype='?') - array_new[:array.size] = array - array_new[array.size:] = False - return array_new - return array - - original = np.random.rand(10, 20, 30) - orthogonal = NumpyOrthogonalIndexingAdapter(original) - I = ReturnItem() - # test broadcasted indexers - indexers = [I[:], 0, -2, I[:3], [0, 1, 2, 3], [0], np.arange(10) < 5] - for i in indexers: - for j in indexers: - for k in indexers: - actual = orthogonal[i, j, k] - j = maybe_boolean_array(j, 20) - k = maybe_boolean_array(k, 30) - if isinstance(i, int): - if isinstance(j, int): - 
expected = original[i][j][k] - else: - expected = original[i][j][:, k] - else: - if isinstance(j, int): - expected = original[i][:, j][:, k] - else: - expected = original[i][:, j][:, :, k] - self.assertArrayEqual(actual, expected) - # indivisual testing - assert orthogonal[np.array([0]), :, :].shape == (1, 20, 30) - self.assertArrayEqual(orthogonal[[0], :, :], original[[0], :, :]) - - class TestIndexers(TestCase): def set_to_zero(self, x, i): x = x.copy() @@ -180,47 +29,6 @@ def test_expanded_indexer(self): with self.assertRaisesRegexp(IndexError, 'too many indices'): indexing.expanded_indexer(I[1, 2, 3], 2) - def test_orthogonal_indexer(self): - x = np.random.randn(10, 11, 12, 13, 14) - y = np.arange(5) - I = ReturnItem() - # orthogonal and numpy indexing should be equivalent, because we only - # use at most one array and it never in between two slice objects - # (i.e., we try to avoid numpy's mind-boggling "partial indexing" - # http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html) - for i in [I[:], I[0], I[0, 0], I[:5], I[5:], I[2:5], I[3:-3], I[::-1], - I[::-2], I[5::-2], I[:3:-2], I[2:5:-1], I[7:3:-2], I[:3, :4], - I[:3, 0, :4], I[:3, 0, :4, 0], I[y], I[:, y], I[0, y], - I[:2, :3, y], I[0, y, :, :4, 0]]: - j = orthogonal_indexer(i, x.shape) - self.assertArrayEqual(x[i], x[j]) - self.assertArrayEqual(self.set_to_zero(x, i), - self.set_to_zero(x, j)) - # for more complicated cases, check orthogonal indexing is still - # equivalent to slicing - z = np.arange(2, 8, 2) - for i, j, shape in [ - (I[y, y], I[:5, :5], (5, 5, 12, 13, 14)), - (I[y, z], I[:5, 2:8:2], (5, 3, 12, 13, 14)), - (I[0, y, y], I[0, :5, :5], (5, 5, 13, 14)), - (I[y, 0, z], I[:5, 0, 2:8:2], (5, 3, 13, 14)), - (I[y, :, z], I[:5, :, 2:8:2], (5, 11, 3, 13, 14)), - (I[0, :, z], I[0, :, 2:8:2], (11, 3, 13, 14)), - (I[0, :2, y, y, 0], I[0, :2, :5, :5, 0], (2, 5, 5)), - (I[0, :, y, :, 0], I[0, :, :5, :, 0], (11, 5, 13)), - (I[:, :, y, :, 0], I[:, :, :5, :, 0], (10, 11, 5, 13)), - (I[:, :, y, z, 
:], I[:, :, :5, 2:8:2], (10, 11, 5, 3, 14))]: - k = orthogonal_indexer(i, x.shape) - self.assertEqual(shape, x[k].shape) - self.assertArrayEqual(x[j], x[k]) - self.assertArrayEqual(self.set_to_zero(x, j), - self.set_to_zero(x, k)) - # standard numpy (non-orthogonal) indexing doesn't work anymore - with self.assertRaisesRegexp(ValueError, 'only supports 1d'): - orthogonal_indexer(x > 0, x.shape) - with self.assertRaisesRegexp(ValueError, 'invalid subkey'): - print(orthogonal_indexer((1.5 * y, 1.5 * y), x.shape)) - def test_asarray_tuplesafe(self): res = indexing._asarray_tuplesafe(('a', 1)) assert isinstance(res, np.ndarray) @@ -332,7 +140,7 @@ def test_slice_slice(self): def test_lazily_indexed_array(self): original = np.random.rand(10, 20, 30) - x = NumpyOrthogonalIndexingAdapter(original) + x = indexing.NumpyIndexingAdapter(original) v = Variable(['i', 'j', 'k'], original) lazy = indexing.LazilyIndexedArray(x) v_lazy = Variable(['i', 'j', 'k'], lazy) @@ -360,7 +168,7 @@ def test_lazily_indexed_array(self): self.assertArrayEqual(expected, actual) assert isinstance(actual._data, indexing.LazilyIndexedArray) assert isinstance(actual._data.array, - NumpyOrthogonalIndexingAdapter) + indexing.NumpyIndexingAdapter) class TestCopyOnWriteArray(TestCase): From 65508800e5a50daabc241b481e0b7f2ba339c8f2 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 30 Jul 2017 16:17:22 +0900 Subject: [PATCH 041/113] assert -> raise --- xarray/core/variable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3cfac6be1ce..4be13c23a6e 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1328,7 +1328,9 @@ def chunk(self, chunks=None, name=None, lock=False): def __getitem__(self, key): dims, index_tuple = self._broadcast_indexes(key) - assert len(dims) <= 1 + if len(dims) > 1: + raise IndexError('Multiple dimension array cannot be used for ' + 'indexing IndexVariable: {}'.format(key)) values = 
self._indexable_data[index_tuple] if not hasattr(values, 'ndim') or values.ndim == 0: return Variable((), values, self._attrs, self._encoding) From 464e7115f96b388f2540288174fd9d636b36ea0a Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 30 Jul 2017 19:41:29 +0900 Subject: [PATCH 042/113] Update Dataset.isel --- xarray/core/dataset.py | 34 ++++++++++---- xarray/tests/test_dataset.py | 86 ++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b02b954fb29..7a0e9fa0131 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1110,6 +1110,9 @@ def isel(self, drop=False, **indexers): **indexers : {dim: indexer, ...} Keyword arguments with names matching dimensions and values given by integers, slice objects or arrays. + indexer can be a integer, slice, array-like or even DataArray. + If DataArrays are passed as indexers, xarray-style indexing will be + carried out. Returns ------- @@ -1123,27 +1126,42 @@ def isel(self, drop=False, **indexers): See Also -------- Dataset.sel - Dataset.sel_points - Dataset.isel_points DataArray.isel """ + from .dataarray import DataArray + invalid = [k for k in indexers if k not in self.dims] if invalid: raise ValueError("dimensions %r do not exist" % invalid) - # all indexers should be int, slice or np.ndarrays - indexers = [(k, (np.asarray(v) - if not isinstance(v, integer_types + (slice,)) - else v)) + # extract new coordinates from indexers + variables = OrderedDict() + for k, v in iteritems(indexers): + if isinstance(v, DataArray): + for c, var in v.coords.items(): + if c in variables: + if not variables[c].equals(var.variable): + raise ValueError('Inconsistent coordinates : {0}' + ' and {1}'.format(variables[c], + var)) + else: + variables[c] = var.variable + coord_names = set(self._coord_names) | set(variables) + + # a tuple, e.g. 
(('x', ), [0, 1]), is converted to Variable + # all indexers should be int, slice, np.ndarrays, or Variable + indexers = [(k, Variable(dims=v[0], data=v[1]) if isinstance(v, tuple) + else v if isinstance(v, integer_types + (slice, Variable)) + else v.variable if isinstance(v, DataArray) + else np.asarray(v)) for k, v in iteritems(indexers)] - variables = OrderedDict() for name, var in iteritems(self._variables): var_indexers = dict((k, v) for k, v in indexers if k in var.dims) new_var = var.isel(**var_indexers) if not (drop and name in var_indexers): variables[name] = new_var - coord_names = set(self._coord_names) & set(variables) + coord_names = coord_names & set(variables) return self._replace_vars_and_dims(variables, coord_names=coord_names) def sel(self, method=None, tolerance=None, drop=False, **indexers): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 950890e8ff0..381b6ef6416 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -846,6 +846,92 @@ def test_isel(self): self.assertItemsEqual(data.coords, ret.coords) self.assertItemsEqual(data.indexes, list(ret.indexes) + ['time']) + def test_isel_fancy(self): + # isel with fancy indexing. 
+ data = create_test_data() + + pdim1 = [1, 2, 3] + pdim2 = [4, 5, 1] + pdim3 = [1, 2, 3] + actual = data.isel(dim1=(('test_coord'), pdim1), + dim2=(('test_coord'), pdim2), + dim3=(('test_coord'), pdim3)) + assert 'test_coord' in actual.dims + assert actual.coords['test_coord'].shape == (len(pdim1), ) + + # Should work with DataArray + actual = data.isel(dim1=DataArray(pdim1, dims='test_coord'), + dim2=(('test_coord'), pdim2), + dim3=(('test_coord'), pdim3)) + assert 'test_coord' in actual.dims + assert actual.coords['test_coord'].shape == (len(pdim1), ) + + actual = data.isel(dim1=(('points'), pdim1), dim2=(('points'), pdim2)) + assert 'points' in actual.dims + assert 'dim3' in actual.dims + assert 'dim3' not in actual.data_vars + np.testing.assert_array_equal(data['dim2'][pdim2], actual['dim2']) + + # test that the order of the indexers doesn't matter + self.assertDatasetIdentical(data.isel(dim1=(('points'), pdim1), + dim2=(('points'), pdim2)), + data.isel(dim2=(('points'), pdim2), + dim1=(('points'), pdim1))) + # make sure we're raising errors in the right places + with self.assertRaisesRegexp(IndexError, + 'Dimensions of indexers mismatch'): + data.isel(dim1=(('points'), [1, 2]), dim2=(('points'), [1, 2, 3])) + + # test to be sure we keep around variables that were not indexed + ds = Dataset({'x': [1, 2, 3, 4], 'y': 0}) + actual = ds.isel(x=(('points'), [0, 1, 2])) + self.assertDataArrayIdentical(ds['y'], actual['y']) + + # tests using index or DataArray as a dim + stations = Dataset() + stations['station'] = ('station', ['A', 'B', 'C']) + stations['dim1s'] = ('station', [1, 2, 3]) + stations['dim2s'] = ('station', [4, 5, 1]) + + actual = data.isel(dim1=stations['dim1s'], + dim2=stations['dim2s']) + assert 'station' in actual.coords + assert 'station' in actual.dims + self.assertDataArrayIdentical(actual['station'].drop(['dim2']), + stations['station']) + + with self.assertRaisesRegexp(ValueError, 'Inconsistent coordinates'): + data.isel(dim1=DataArray([0, 1, 
2], dims='station', + coords={'station': [0, 1, 2]}), + dim2=DataArray([0, 1, 2], dims='station', + coords={'station': [0, 1, 3]})) + + # multi-dimensional selection + stations = Dataset() + stations['a'] = ('a', ['A', 'B', 'C']) + stations['b'] = ('b', [0, 1]) + stations['dim1s'] = (('a', 'b'), [[1, 2], [2, 3], [3, 4]]) + stations['dim2s'] = (('a', ), [4, 5, 1]) + + actual = data.isel(dim1=stations['dim1s'], + dim2=stations['dim2s']) + assert 'a' in actual.coords + assert 'a' in actual.dims + assert 'b' in actual.coords + assert 'b' in actual.dims + self.assertDataArrayIdentical(actual['a'].drop(['dim2']), + stations['a']) + self.assertDataArrayIdentical(actual['b'], stations['b']) + expected_var1 = data['var1'].variable[stations['dim1s'].variable, + stations['dim2s'].variable] + expected_var2 = data['var2'].variable[stations['dim1s'].variable, + stations['dim2s'].variable] + expected_var3 = data['var3'].variable[slice(None), + stations['dim1s'].variable] + self.assertArrayEqual(actual['var1'], expected_var1) + self.assertArrayEqual(actual['var2'], expected_var2) + self.assertArrayEqual(actual['var3'], expected_var3) + def test_sel(self): data = create_test_data() int_slicers = {'dim1': slice(None, None, 2), From 7dd171d0618a12ac1ae143c70a302478e47e1af1 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 30 Jul 2017 21:19:45 +0900 Subject: [PATCH 043/113] Use `merge_variables` in checking the consistency. 
--- xarray/core/dataset.py | 18 ++++++------------ xarray/tests/test_dataset.py | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7a0e9fa0131..3f6cd1678c3 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -24,7 +24,7 @@ from .coordinates import DatasetCoordinates, LevelCoordinatesSource, Indexes from .common import ImplementsDatasetReduce, BaseDataObject, is_datetime_like from .merge import (dataset_update_method, dataset_merge_method, - merge_data_and_coords) + merge_data_and_coords, merge_variables) from .utils import (Frozen, SortedKeysDict, maybe_wrap_array, hashable, decode_numpy_dict_values, ensure_us_time_resolution) from .variable import (Variable, as_variable, IndexVariable, @@ -1135,17 +1135,11 @@ def isel(self, drop=False, **indexers): raise ValueError("dimensions %r do not exist" % invalid) # extract new coordinates from indexers - variables = OrderedDict() - for k, v in iteritems(indexers): - if isinstance(v, DataArray): - for c, var in v.coords.items(): - if c in variables: - if not variables[c].equals(var.variable): - raise ValueError('Inconsistent coordinates : {0}' - ' and {1}'.format(variables[c], - var)) - else: - variables[c] = var.variable + variables = merge_variables([v._coords for _, v in + iteritems(indexers) + if isinstance(v, DataArray)], + compat='identical') + coord_names = set(self._coord_names) | set(variables) # a tuple, e.g. 
(('x', ), [0, 1]), is converted to Variable diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 381b6ef6416..a462133e1a5 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -900,7 +900,7 @@ def test_isel_fancy(self): self.assertDataArrayIdentical(actual['station'].drop(['dim2']), stations['station']) - with self.assertRaisesRegexp(ValueError, 'Inconsistent coordinates'): + with self.assertRaisesRegexp(ValueError, 'conflicting values for '): data.isel(dim1=DataArray([0, 1, 2], dims='station', coords={'station': [0, 1, 2]}), dim2=DataArray([0, 1, 2], dims='station', From e8f006b2a4708038125d5c286ee09d667495ee53 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 30 Jul 2017 14:52:44 -0700 Subject: [PATCH 044/113] Cleanup Dataset.__getitem__ --- xarray/core/computation.py | 17 +++++++++++++++++ xarray/core/dataset.py | 28 ++++++++++++++++------------ xarray/tests/test_dataset.py | 2 ++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index e866de2752a..8996cd61f57 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -136,6 +136,23 @@ def build_output_coords( signature, # type: _UFuncSignature exclude_dims=frozenset(), # type: set ): + """Build output coordinates for an operation. + + Parameters + ---------- + args : list + List of raw operation arguments. Any valid types for xarray operations + are OK, e.g., scalars, Variable, DataArray, Dataset. + signature : _UfuncSignature + Core dimensions signature for the operation. + exclude_dims : optional set + Dimensions excluded from the operation. Coordinates along these + dimensions are dropped. + + Returns + ------- + OrderedDict of Variable objects with merged coordinates. + """ # type: (...) 
-> List[OrderedDict[Any, Variable]] input_coords = _get_coord_variables(args) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3f6cd1678c3..4a32b48c508 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1135,23 +1135,27 @@ def isel(self, drop=False, **indexers): raise ValueError("dimensions %r do not exist" % invalid) # extract new coordinates from indexers - variables = merge_variables([v._coords for _, v in - iteritems(indexers) - if isinstance(v, DataArray)], - compat='identical') - + variables = merge_variables([v._coords for v in indexers.values() + if isinstance(v, DataArray)]) coord_names = set(self._coord_names) | set(variables) - # a tuple, e.g. (('x', ), [0, 1]), is converted to Variable # all indexers should be int, slice, np.ndarrays, or Variable - indexers = [(k, Variable(dims=v[0], data=v[1]) if isinstance(v, tuple) - else v if isinstance(v, integer_types + (slice, Variable)) - else v.variable if isinstance(v, DataArray) - else np.asarray(v)) - for k, v in iteritems(indexers)] + indexers_list = [] + for k, v in iteritems(indexers): + if isinstance(v, integer_types + (slice, Variable)): + pass + elif isinstance(v, DataArray): + v = v.variable + elif isinstance(v, tuple): + v = as_variable(v) + elif isinstance(v, Dataset): + raise TypeError('cannot use a Dataset as an indexer') + else: + v = np.asarray(v) + indexers_list.append((k, v)) for name, var in iteritems(self._variables): - var_indexers = dict((k, v) for k, v in indexers if k in var.dims) + var_indexers = {k: v for k, v in indexers_list if k in var.dims} new_var = var.isel(**var_indexers) if not (drop and name in var_indexers): variables[name] = new_var diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index a462133e1a5..17836c0c7ca 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -881,6 +881,8 @@ def test_isel_fancy(self): with self.assertRaisesRegexp(IndexError, 'Dimensions of indexers mismatch'): 
data.isel(dim1=(('points'), [1, 2]), dim2=(('points'), [1, 2, 3])) + with self.assertRaisesRegexp(TypeError, 'cannot use a Dataset'): + data.isel(dim1=Dataset({'points': [1, 2]})) # test to be sure we keep around variables that were not indexed ds = Dataset({'x': [1, 2, 3, 4], 'y': 0}) From a8ec82b4aa638a1e21eb4dd5a4c8c5cc0eb3d59a Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 30 Jul 2017 15:23:16 -0700 Subject: [PATCH 045/113] Add comment about why align() is unneeded --- xarray/core/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4a32b48c508..48e5a5bc2aa 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1135,6 +1135,8 @@ def isel(self, drop=False, **indexers): raise ValueError("dimensions %r do not exist" % invalid) # extract new coordinates from indexers + # we don't need to call align() explicitly, because merge_variables + # already checks for exact alignment between dimension coordinates variables = merge_variables([v._coords for v in indexers.values() if isinstance(v, DataArray)]) coord_names = set(self._coord_names) | set(variables) From 32749d415708be858cf39f2d194b68b35a4149a2 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 30 Jul 2017 21:58:16 -0700 Subject: [PATCH 046/113] Ensure correct tests are run in test_variable.py --- xarray/tests/test_variable.py | 246 ++++++++++++++++++---------------- 1 file changed, 127 insertions(+), 119 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 01ef4eb64b7..d4293ba3313 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -494,6 +494,92 @@ def test_load(self): assert type(copied._data) is type(orig_data) self.assertVariableIdentical(array, copied) + def test_getitem_advanced(self): + v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + + # orthogonal indexing + v_new = v[([0, 1], [1, 0])] + assert v_new.dims == ('x', 'y') + 
self.assertArrayEqual(v_new, v._data[[0, 1]][:, [1, 0]]) + + v_new = v[[0, 1]] + assert v_new.dims == ('x', 'y') + self.assertArrayEqual(v_new, v._data[[0, 1]]) + + # with mixed arguments + ind = Variable(['a'], [0, 1]) + v_new = v[dict(x=[0, 1], y=ind)] + assert v_new.dims == ('x', 'a') + self.assertArrayEqual(v_new, v.load()._data[[0, 1]][:, [0, 1]]) + + # boolean indexing + v_new = v[dict(x=[True, False], y=[False, True])] + assert v_new.dims == ('x', 'y') + self.assertArrayEqual(v_new, v.load()._data[0][1]) + + ind = Variable(['a'], [True, False]) + v_new = v[dict(y=ind)] + assert v_new.dims == ('x', 'a') + self.assertArrayEqual(v_new, v.load()._data[:, 0:1]) + + def test_getitem_fancy(self): + # Note This fancy getitem is not supported by dask-based Variable. + v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + + ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) + v_new = v[ind] + assert v_new.dims == ('a', 'b', 'y') + self.assertArrayEqual(v_new, v.load()._data[([0, 1, 1], [1, 1, 0]), :]) + + ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) + v_new = v[dict(y=ind)] + assert v_new.dims == ('x', 'a', 'b') + self.assertArrayEqual(v_new, v.load()._data[:, ([0, 1, 2], [2, 1, 0])]) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v[dict(x=[1, 0], y=ind)] + assert v_new.dims == ('x', 'a', 'b') + self.assertArrayEqual(v_new, v.load()._data[[1, 0]][:, ind]) + + # with integer + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v[dict(x=0, y=ind)] + assert v_new.dims == ('a', 'b') + self.assertArrayEqual(v_new[0], v.load()._data[0][[0, 0]]) + self.assertArrayEqual(v_new[1], v.load()._data[0][[1, 1]]) + + # with slice + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v[dict(x=slice(None), y=ind)] + assert v_new.dims == ('x', 'a', 'b') + self.assertArrayEqual(v_new, v.load()._data[:, [[0, 0], [1, 1]]]) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v[dict(x=ind, y=slice(None))] + assert v_new.dims == ('a', 'b', 'y') + 
self.assertArrayEqual(v_new, v.load()._data[[[0, 0], [1, 1]], :]) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v[dict(x=ind, y=slice(None, 1))] + assert v_new.dims == ('a', 'b', 'y') + self.assertArrayEqual(v_new, + v.load()._data[[[0, 0], [1, 1]], slice(None, 1)]) + + def test_getitem_error(self): + v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + + with self.assertRaisesRegexp(IndexError, "Unlabelled multi-"): + v[[[0, 1], [1, 2]]] + + with self.assertRaisesRegexp(IndexError, "Dimensions of indexers "): + ind_x = Variable(['a', 'b'], [[0, 0], [1, 1]]) + ind_y = Variable(['a'], [0]) + v[(ind_x, ind_y)] + + with self.assertRaisesRegexp(IndexError, "2-dimensional boolean"): + ind = Variable(['a', 'b'], [[True, False], [False, True]]) + v[dict(x=ind)] + class TestVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(Variable) @@ -798,117 +884,6 @@ def test_getitem_basic(self): assert v_new.dims == ('x', ) self.assertArrayEqual(v_new, v._data[:, 1]) - def test_getitem_advanced(self): - v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) - - # orthogonal indexing - v_new = v[([0, 1], [1, 0])] - assert v_new.dims == ('x', 'y') - self.assertArrayEqual(v_new, v._data[[0, 1]][:, [1, 0]]) - - v_new = v[[0, 1]] - assert v_new.dims == ('x', 'y') - self.assertArrayEqual(v_new, v._data[[0, 1]]) - - # with mixed arguments - ind = Variable(['a'], [0, 1]) - v_new = v[dict(x=[0, 1], y=ind)] - assert v_new.dims == ('x', 'a') - self.assertArrayEqual(v_new, v.load()._data[[0, 1]][:, [0, 1]]) - - # boolean indexing - v_new = v[dict(x=[True, False], y=[False, True])] - assert v_new.dims == ('x', 'y') - self.assertArrayEqual(v_new, v.load()._data[0][1]) - - ind = Variable(['a'], [True, False]) - v_new = v[dict(y=ind)] - assert v_new.dims == ('x', 'a') - self.assertArrayEqual(v_new, v.load()._data[:, 0:1]) - - def test_getitem_fancy(self): - # Note This fancy getitem is not supported by dask-based Variable. 
- v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) - - ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) - v_new = v[ind] - assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, v.load()._data[([0, 1, 1], [1, 1, 0]), :]) - - ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) - v_new = v[dict(y=ind)] - assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v.load()._data[:, ([0, 1, 2], [2, 1, 0])]) - - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v[dict(x=[1, 0], y=ind)] - assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v.load()._data[[1, 0]][:, ind]) - - # with integer - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v[dict(x=0, y=ind)] - assert v_new.dims == ('a', 'b') - self.assertArrayEqual(v_new[0], v.load()._data[0][[0, 0]]) - self.assertArrayEqual(v_new[1], v.load()._data[0][[1, 1]]) - - # with slice - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v[dict(x=slice(None), y=ind)] - assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v.load()._data[:, [[0, 0], [1, 1]]]) - - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v[dict(x=ind, y=slice(None))] - assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, v.load()._data[[[0, 0], [1, 1]], :]) - - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v[dict(x=ind, y=slice(None, 1))] - assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, - v.load()._data[[[0, 0], [1, 1]], slice(None, 1)]) - - def test_getitem_error(self): - v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) - - with self.assertRaisesRegexp(IndexError, "Unlabelled multi-"): - v[[[0, 1], [1, 2]]] - - with self.assertRaisesRegexp(IndexError, "Dimensions of indexers "): - ind_x = Variable(['a', 'b'], [[0, 0], [1, 1]]) - ind_y = Variable(['a'], [0]) - v[(ind_x, ind_y)] - - with self.assertRaisesRegexp(IndexError, "2-dimensional boolean"): - ind = Variable(['a', 'b'], [[True, False], [False, True]]) - v[dict(x=ind)] - - def 
test_setitem(self): - v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) - v[0, 1] = 1 - self.assertTrue(v[0, 1] == 1) - - v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) - v[dict(x=[0, 1])] = 1 - self.assertArrayEqual(v[[0, 1]], np.ones_like(v[[0, 1]])) - - # boolean indexing - v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) - v[dict(x=[True, False])] = 1 - - self.assertArrayEqual(v[0], np.ones_like(v[0])) - v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) - v[dict(x=[True, False], y=[False, True])] = 1 - self.assertTrue(v[0, 1] == 1) - - # dimension broadcast - v = self.cls(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) - ind = Variable(['a'], [0, 1]) - v[dict(x=ind)] = Variable(['a', 'y'], np.ones((2, 3), dtype=int) * 10) - self.assertArrayEqual(v[0], np.ones_like(v[0]) * 10) - self.assertArrayEqual(v[1], np.ones_like(v[0]) * 10) - def test_isel(self): v = Variable(['time', 'x'], self.d) self.assertVariableIdentical(v.isel(time=slice(None)), v) @@ -1287,13 +1262,36 @@ def test_count(self): actual = Variable(['x', 'y'], [[1, 0, np.nan], [1, 1, 1]]).count('y') self.assertVariableIdentical(expected, actual) + def test_setitem(self): + v = Variable(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[0, 1] = 1 + self.assertTrue(v[0, 1] == 1) + + v = Variable(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[dict(x=[0, 1])] = 1 + self.assertArrayEqual(v[[0, 1]], np.ones_like(v[[0, 1]])) + + # boolean indexing + v = Variable(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[dict(x=[True, False])] = 1 + + self.assertArrayEqual(v[0], np.ones_like(v[0])) + v = Variable(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + v[dict(x=[True, False], y=[False, True])] = 1 + self.assertTrue(v[0, 1] == 1) + + # dimension broadcast + v = Variable(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) + ind = Variable(['a'], [0, 1]) + v[dict(x=ind)] = Variable(['a', 'y'], np.ones((2, 3), dtype=int) * 10) + self.assertArrayEqual(v[0], np.ones_like(v[0]) * 10) + self.assertArrayEqual(v[1], np.ones_like(v[0]) * 10) + + @requires_dask -class 
TestVariable_withDask(TestVariable): +class TestVariable_withDask(TestCase, VariableSubclassTestCases): cls = staticmethod(lambda *args: Variable(*args).chunk()) - def setUp(self): - super(TestVariable_withDask, self).setUp() - @pytest.mark.xfail def test_0d_object_array_with_list(self): super(TestVariable_withDask, self).test_0d_object_array_with_list() @@ -1315,10 +1313,6 @@ def test_eq_all_dtypes(self): def test_getitem_fancy(self): super(TestVariable_withDask, self).test_getitem_fancy() - @pytest.mark.xfail - def test_setitem(self): - super(TestVariable_withDask, self).test_setitem() - class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) @@ -1398,6 +1392,20 @@ def test_coordinate_alias(self): x = Coordinate('x', [1, 2, 3]) self.assertIsInstance(x, IndexVariable) + # These tests make use of multi-dimensional variables, which are not valid + # IndexVariable objects: + @pytest.mark.xfail + def test_getitem_error(self): + super(TestIndexVariable, self).test_getitem_error() + + @pytest.mark.xfail + def test_getitem_advanced(self): + super(TestIndexVariable, self).test_getitem_advanced() + + @pytest.mark.xfail + def test_getitem_fancy(self): + super(TestIndexVariable, self).test_getitem_fancy() + class TestAsCompatibleData(TestCase): def test_unchanged_types(self): From 31401d4841f0dcfa7e902550d1d0793ee8620145 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 30 Jul 2017 22:55:10 -0700 Subject: [PATCH 047/113] Support pointwise indexing with dask --- xarray/core/indexing.py | 38 +++++++++- xarray/core/npcompat.py | 133 +++++++++++++++++++++++++++++++++- xarray/core/variable.py | 55 ++++++++++++-- xarray/tests/test_variable.py | 90 ++++++++++++++++++----- 4 files changed, 285 insertions(+), 31 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 8c8e452d6b9..02e5bc5ff8f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -7,6 +7,7 @@ import pandas as pd from . 
import utils +from .npcompat import moveaxis from .pycompat import (iteritems, range, integer_types, dask_array_type, suppress) from .utils import is_dict_like @@ -314,6 +315,10 @@ class VectorizedIndexer(IndexerTuple): """ Tuple for vectorized indexing """ +class PointwiseIndexer(IndexerTuple): + """ Tuple for pointwise indexing with dask.array's vindex """ + + class LazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy """ @@ -477,10 +482,11 @@ def __init__(self, array): self.array = array def __getitem__(self, key): - if isinstance(key, VectorizedIndexer): - # TODO should support vindex - raise IndexError( - 'dask does not support vectorized indexing : {}'.format(key)) + # should always get PointwiseIndexer instead + assert not isinstance(key, VectorizedIndexer) + + if isinstance(key, PointwiseIndexer): + return self._getitem_pointwise(key) try: key = to_tuple(key) @@ -492,6 +498,30 @@ def __getitem__(self, key): value = value[(slice(None),) * axis + (subkey,)] return value + def _getitem_pointwise(self, key): + pointwise_shape, pointwise_index = next( + (k.shape, i) for i, k in enumerate(key) + if not isinstance(k, slice)) + # dask's indexing only handles 1d arrays + flat_key = tuple(k if isinstance(k, slice) else k.ravel() + for k in key) + + if len([k for k in key if not isinstance(k, slice)]) == 1: + # vindex requires more than one non-slice :( + # but we can use normal indexing instead + indexed = self.array[flat_key] + new_shape = (indexed.shape[:pointwise_index] + + pointwise_shape + + indexed.shape[pointwise_index + 1:]) + return indexed.reshape(new_shape) + else: + indexed = self.array.vindex[flat_key] + # vindex always moves slices to the end + reshaped = indexed.reshape(pointwise_shape + indexed.shape[1:]) + # reorder dimensions to match order of appearance + positions = np.arange(0, len(pointwise_shape)) + return moveaxis(reshaped, positions, positions + pointwise_index) + def 
__setitem__(self, key, value): raise TypeError("this variable's data is stored in a dask array, " 'which does not support item assignment. To ' diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 55b0286f999..499292530af 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -1,10 +1,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + +import operator + import numpy as np try: - from numpy import broadcast_to, stack, nanprod, nancumsum, nancumprod + from numpy import (broadcast_to, stack, nanprod, nancumsum, nancumprod, + moveaxis) except ImportError: # pragma: no cover # Code copied from newer versions of NumPy (v1.10 to v1.12). # Used under the terms of NumPy's license, see licenses/NUMPY_LICENSE. @@ -371,3 +375,130 @@ def nancumprod(a, axis=None, dtype=None, out=None): """ a, mask = _replace_nan(a, 1) return np.cumprod(a, axis=axis, dtype=dtype, out=out) + + + def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False): + """ + Normalizes an axis argument into a tuple of non-negative integer axes. + + This handles shorthands such as ``1`` and converts them to ``(1,)``, + as well as performing the handling of negative indices covered by + `normalize_axis_index`. + + By default, this forbids axes from being specified multiple times. + + Used internally by multi-axis-checking logic. + + .. versionadded:: 1.13.0 + + Parameters + ---------- + axis : int, iterable of int + The un-normalized index or indices of the axis. + ndim : int + The number of dimensions of the array that `axis` should be normalized + against. + argname : str, optional + A prefix to put before the error message, typically the name of the + argument. + allow_duplicate : bool, optional + If False, the default, disallow an axis from being specified twice. 
+ + Returns + ------- + normalized_axes : tuple of int + The normalized axis index, such that `0 <= normalized_axis < ndim` + + Raises + ------ + AxisError + If any axis provided is out of range + ValueError + If an axis is repeated + + See also + -------- + normalize_axis_index : normalizing a single scalar axis + """ + try: + axis = [operator.index(axis)] + except TypeError: + axis = tuple(axis) + axis = tuple(normalize_axis_index(ax, ndim, argname) for ax in axis) + if not allow_duplicate and len(set(axis)) != len(axis): + if argname: + raise ValueError('repeated axis in `{}` argument'.format(argname)) + else: + raise ValueError('repeated axis') + return axis + + + def moveaxis(a, source, destination): + """ + Move axes of an array to new positions. + + Other axes remain in their original order. + + .. versionadded::1.11.0 + + Parameters + ---------- + a : np.ndarray + The array whose axes should be reordered. + source : int or sequence of int + Original positions of the axes to move. These must be unique. + destination : int or sequence of int + Destination positions for each of the original axes. These must also be + unique. + + Returns + ------- + result : np.ndarray + Array with moved axes. This array is a view of the input array. + + See Also + -------- + transpose: Permute the dimensions of an array. + swapaxes: Interchange two axes of an array. 
+ + Examples + -------- + + >>> x = np.zeros((3, 4, 5)) + >>> np.moveaxis(x, 0, -1).shape + (4, 5, 3) + >>> np.moveaxis(x, -1, 0).shape + (5, 3, 4) + + These all achieve the same result: + + >>> np.transpose(x).shape + (5, 4, 3) + >>> np.swapaxes(x, 0, -1).shape + (5, 4, 3) + >>> np.moveaxis(x, [0, 1], [-1, -2]).shape + (5, 4, 3) + >>> np.moveaxis(x, [0, 1, 2], [-1, -2, -3]).shape + (5, 4, 3) + + """ + try: + # allow duck-array types if they define transpose + transpose = a.transpose + except AttributeError: + a = np.asarray(a) + transpose = a.transpose + + source = normalize_axis_tuple(source, a.ndim, 'source') + destination = normalize_axis_tuple(destination, a.ndim, 'destination') + if len(source) != len(destination): + raise ValueError('`source` and `destination` arguments must have ' + 'the same number of elements') + + order = [n for n in range(a.ndim) if n not in source] + + for dest, src in sorted(zip(destination, source)): + order.insert(dest, src) + + result = transpose(order) + return result diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 4be13c23a6e..abdb630909e 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -19,7 +19,7 @@ from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) from .indexing import (PandasIndexAdapter, xarray_indexable, BasicIndexer, - OuterIndexer, VectorizedIndexer) + OuterIndexer, PointwiseIndexer, VectorizedIndexer) import xarray as xr # only for Dataset and DataArray @@ -464,8 +464,11 @@ def _nonzero(self): in zip(nonzeros, self.dims)) def _broadcast_indexes_advanced(self, key): - variables = [] + if isinstance(self._data, dask_array_type): + # dask only supports a very restricted form of advanced indexing + return self._broadcast_indexes_dask_pointwise(key) + variables = [] for dim, value in zip(self.dims, key): if isinstance(value, slice): value = np.arange(*value.indices(self.sizes[dim])) @@ -492,6 +495,46 @@ def _broadcast_indexes_advanced(self, key): key 
= VectorizedIndexer(variable.data for variable in variables) return dims, key + def _broadcast_indexes_dask_pointwise(self, key): + if any(not isinstance(k, (Variable, slice)) for k in key): + raise IndexError( + 'Vectorized indexing with dask requires that all indexers are ' + 'labeled arrays or full slice objects: {}'.format(key)) + + if any(isinstance(k, Variable) and k.dtype.kind == 'b' for k in key): + raise IndexError( + 'Vectorized indexing with dask does not support booleans: {}' + .format(key)) + + dims_set = {k.dims for k in key if isinstance(k, Variable)} + if len(dims_set) != 1: + raise IndexError( + 'Vectorized indexing with dask requires that all labeled ' + 'arrays in the indexer have the same dimension names, but ' + 'arrays have different dimensions: {}'.format(key)) + (unique_dims,) = dims_set + + shapes_set = {k.shape for k in key if isinstance(k, Variable)} + if len(shapes_set) != 1: + # matches message in _broadcast_indexes_advanced + raise IndexError("Dimensions of indexers mismatch: {}".format(key)) + + dims = [] + found_first_array = False + for k, d in zip(key, self.dims): + if isinstance(k, slice): + if d in unique_dims: + raise IndexError( + 'Labeled arrays used in vectorized indexing with dask ' + 'cannot reuse a sliced dimension: {}'.format(d)) + dims.append(d) + elif not found_first_array: + dims.extend(k.dims) + found_first_array = True + + key = PointwiseIndexer(getattr(k, 'data', k) for k in key) + return tuple(dims), key + def __getitem__(self, key): """Return a new Array object whose contents are consistent with getting the provided key from the underlying data. @@ -507,12 +550,12 @@ def __getitem__(self, key): array `x.values` directly. 
""" dims, index_tuple = self._broadcast_indexes(key) - values = self._indexable_data[index_tuple] - if hasattr(values, 'ndim'): - assert values.ndim == len(dims), (values.ndim, len(dims)) + data = self._indexable_data[index_tuple] + if hasattr(data, 'ndim'): + assert data.ndim == len(dims), (data.ndim, len(dims)) else: assert len(dims) == 0, len(dims) - return type(self)(dims, values, self._attrs, self._encoding, + return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) def __setitem__(self, key, value): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d4293ba3313..c9896623c1f 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -496,74 +496,81 @@ def test_load(self): def test_getitem_advanced(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + v_data = v.compute().data # orthogonal indexing v_new = v[([0, 1], [1, 0])] assert v_new.dims == ('x', 'y') - self.assertArrayEqual(v_new, v._data[[0, 1]][:, [1, 0]]) + self.assertArrayEqual(v_new, v_data[[0, 1]][:, [1, 0]]) v_new = v[[0, 1]] assert v_new.dims == ('x', 'y') - self.assertArrayEqual(v_new, v._data[[0, 1]]) + self.assertArrayEqual(v_new, v_data[[0, 1]]) # with mixed arguments ind = Variable(['a'], [0, 1]) v_new = v[dict(x=[0, 1], y=ind)] assert v_new.dims == ('x', 'a') - self.assertArrayEqual(v_new, v.load()._data[[0, 1]][:, [0, 1]]) + self.assertArrayEqual(v_new, v_data[[0, 1]][:, [0, 1]]) # boolean indexing v_new = v[dict(x=[True, False], y=[False, True])] assert v_new.dims == ('x', 'y') - self.assertArrayEqual(v_new, v.load()._data[0][1]) + self.assertArrayEqual(v_new, v_data[0][1]) ind = Variable(['a'], [True, False]) v_new = v[dict(y=ind)] assert v_new.dims == ('x', 'a') - self.assertArrayEqual(v_new, v.load()._data[:, 0:1]) + self.assertArrayEqual(v_new, v_data[:, 0:1]) def test_getitem_fancy(self): - # Note This fancy getitem is not supported by dask-based Variable. 
+ # dask-based Variable objects don't support this full set of operations. v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + v_data = v.compute().data ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) v_new = v[ind] assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, v.load()._data[([0, 1, 1], [1, 1, 0]), :]) + self.assertArrayEqual(v_new, v_data[([0, 1, 1], [1, 1, 0]), :]) ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) v_new = v[dict(y=ind)] assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v.load()._data[:, ([0, 1, 2], [2, 1, 0])]) + self.assertArrayEqual(v_new, v_data[:, ([0, 1, 2], [2, 1, 0])]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=[1, 0], y=ind)] assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v.load()._data[[1, 0]][:, ind]) + self.assertArrayEqual(v_new, v_data[[1, 0]][:, ind]) + + # along diagonal + ind = Variable(['a'], [0, 1]) + v_new = v[ind, ind] + assert v_new.dims == ('a',) + self.assertArrayEqual(v_new, v_data[[0, 1], [0, 1]]) # with integer ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=0, y=ind)] assert v_new.dims == ('a', 'b') - self.assertArrayEqual(v_new[0], v.load()._data[0][[0, 0]]) - self.assertArrayEqual(v_new[1], v.load()._data[0][[1, 1]]) + self.assertArrayEqual(v_new[0], v_data[0][[0, 0]]) + self.assertArrayEqual(v_new[1], v_data[0][[1, 1]]) # with slice ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=slice(None), y=ind)] assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v.load()._data[:, [[0, 0], [1, 1]]]) + self.assertArrayEqual(v_new, v_data[:, [[0, 0], [1, 1]]]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=ind, y=slice(None))] assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, v.load()._data[[[0, 0], [1, 1]], :]) + self.assertArrayEqual(v_new, v_data[[[0, 0], [1, 1]], :]) ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) v_new = v[dict(x=ind, y=slice(None, 1))] assert v_new.dims 
== ('a', 'b', 'y') - self.assertArrayEqual(v_new, - v.load()._data[[[0, 0], [1, 1]], slice(None, 1)]) + self.assertArrayEqual(v_new, v_data[[[0, 0], [1, 1]], slice(None, 1)]) def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) @@ -571,13 +578,16 @@ def test_getitem_error(self): with self.assertRaisesRegexp(IndexError, "Unlabelled multi-"): v[[[0, 1], [1, 2]]] + ind_x = Variable(['a'], [0, 1, 1]) + ind_y = Variable(['a'], [0, 1]) with self.assertRaisesRegexp(IndexError, "Dimensions of indexers "): - ind_x = Variable(['a', 'b'], [[0, 0], [1, 1]]) - ind_y = Variable(['a'], [0]) - v[(ind_x, ind_y)] + v[ind_x, ind_y] - with self.assertRaisesRegexp(IndexError, "2-dimensional boolean"): - ind = Variable(['a', 'b'], [[True, False], [False, True]]) + ind = Variable(['a', 'b'], [[True, False], [False, True]]) + msg = ('dask does not support booleans' + if type(self) is TestVariable_withDask + else '2-dimensional boolean') + with self.assertRaisesRegexp(IndexError, msg): v[dict(x=ind)] @@ -1313,6 +1323,46 @@ def test_eq_all_dtypes(self): def test_getitem_fancy(self): super(TestVariable_withDask, self).test_getitem_fancy() + def test_getitem_fancy(self): + # selectively copied from the superclass. 
+ v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + v_data = v.compute().data + + ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) + v_new = v[ind] + assert v_new.dims == ('a', 'b', 'y') + self.assertArrayEqual(v_new, v_data[([0, 1, 1], [1, 1, 0]), :]) + + ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) + v_new = v[dict(y=ind)] + assert v_new.dims == ('x', 'a', 'b') + self.assertArrayEqual(v_new, v_data[:, ([0, 1, 2], [2, 1, 0])]) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + with self.assertRaisesRegexp(IndexError, 'same dimension names'): + v_new = v[dict(x=ind[:, 0], y=ind)] + + # along diagonal + ind = Variable(['a'], [0, 1]) + v_new = v[ind, ind] + assert v_new.dims == ('a',) + self.assertArrayEqual(v_new, v_data[[0, 1], [0, 1]]) + + # with slice + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v[dict(x=slice(None), y=ind)] + assert v_new.dims == ('x', 'a', 'b') + self.assertArrayEqual(v_new, v_data[:, [[0, 0], [1, 1]]]) + + ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) + v_new = v[dict(x=ind, y=slice(None))] + assert v_new.dims == ('a', 'b', 'y') + self.assertArrayEqual(v_new, v_data[[[0, 0], [1, 1]], :]) + + ind = Variable(['x', 'b'], [[0, 0], [1, 1]]) + with self.assertRaisesRegexp(IndexError, 'reuse a sliced dimension'): + v_new = v[dict(x=slice(None), y=ind)] + class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From 8d96ad3ee292eadcf418af169b55d81a8174b1c1 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Aug 2017 00:13:07 -0700 Subject: [PATCH 048/113] Add a vindex routine for np.ndarray --- xarray/core/nputils.py | 54 ++++++++++++++++++++++++++++++++++++ xarray/tests/test_nputils.py | 30 ++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 xarray/tests/test_nputils.py diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 5ebab4ec407..3c4982ac0bf 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -5,6 +5,8 @@ import pandas as pd 
import warnings +from .npcompat import moveaxis + def _validate_axis(data, axis): ndim = data.ndim @@ -79,3 +81,55 @@ def array_ne(self, other): with warnings.catch_warnings(): warnings.filterwarnings('ignore', r'elementwise comparison failed') return _ensure_bool_is_ndarray(self != other, self, other) + + +def _is_contiguous(positions): + """Given a non-empty list, does it consist of contiguous integers?""" + previous = positions[0] + for current in positions[1:]: + if current != previous + 1: + return False + previous = current + return True + + +def _advanced_indexer_subspaces(key): + """Indices of the advanced indexes subspaces for mixed indexing and vindex. + """ + if not isinstance(key, tuple): + key = (key,) + advanced_index_positions = [i for i, k in enumerate(key) + if not isinstance(k, slice)] + + if (not advanced_index_positions or + not _is_contiguous(advanced_index_positions)): + # Nothing to reorder: dimensions on the indexing result are already + # ordered like vindex. See NumPy's rule for "Combining advanced and + # basic indexing": + # https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#combining-advanced-and-basic-indexing + return (), () + + non_slices = [k for k in key if not isinstance(k, slice)] + ndim = len(np.broadcast(*non_slices).shape) + mixed_positions = advanced_index_positions[0] + np.arange(ndim) + vindex_positions = np.arange(ndim) + return mixed_positions, vindex_positions + + +class VectorizedIndex(object): + """Object that implements indexing like vindex on a np.ndarray. 
+ + This is a pure Python implementation of (some of) the logic in this NumPy + proposal: https://github.com/numpy/numpy/pull/6256 + """ + def __init__(self, array): + self._array = array + + def __getitem__(self, key): + mixed_positions, vindex_positions = _advanced_indexer_subspaces(key) + return moveaxis(self._array[key], mixed_positions, vindex_positions) + + def __setitem__(self, key, value): + """Value must have dimensionality matching the key.""" + mixed_positions, vindex_positions = _advanced_indexer_subspaces(key) + self._array[key] = moveaxis(value, vindex_positions, mixed_positions) diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py new file mode 100644 index 00000000000..062885ae155 --- /dev/null +++ b/xarray/tests/test_nputils.py @@ -0,0 +1,30 @@ +import numpy as np +from numpy.testing import assert_array_equal + +from xarray.core.nputils import _is_contiguous, VectorizedIndex + + +def test_is_contiguous(): + assert _is_contiguous([1]) + assert _is_contiguous([1, 2, 3]) + assert not _is_contiguous([1, 3]) + + +def test_vindex(): + x = np.arange(3 * 4 * 5).reshape((3, 4, 5)) + vindex = VectorizedIndex(x) + + # getitem + assert_array_equal(vindex[0], x[0]) + assert_array_equal(vindex[[1, 2], [1, 2]], x[[1, 2], [1, 2]]) + assert vindex[[0, 1], [0, 1], :].shape == (2, 5) + assert vindex[[0, 1], :, [0, 1]].shape == (2, 4) + assert vindex[:, [0, 1], [0, 1]].shape == (2, 3) + + # setitem + vindex[:] = 0 + assert_array_equal(x, np.zeros_like(x)) + # assignment should not raise + vindex[[0, 1], [0, 1], :] = vindex[[0, 1], [0, 1], :] + vindex[[0, 1], :, [0, 1]] = vindex[[0, 1], :, [0, 1]] + vindex[:, [0, 1], [0, 1]] = vindex[:, [0, 1], [0, 1]] From 19f720462af76b89fdd7197b7bf9763e0c3520a7 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Aug 2017 00:13:41 -0700 Subject: [PATCH 049/113] Add an OrderedSet to xarray.core.utils --- xarray/core/utils.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 
deletion(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 89d1462328c..d31d6692c33 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -8,7 +8,7 @@ import itertools import re import warnings -from collections import Mapping, MutableMapping, Iterable +from collections import Mapping, MutableMapping, MutableSet, Iterable import numpy as np import pandas as pd @@ -378,6 +378,43 @@ def __len__(self): raise len(iter(self)) +class OrderedSet(MutableSet): + """A simple ordered set. + + The API matches the builtin set, but it preserves insertion order of + elements, like an OrderedDict. + """ + def __init__(self, values=None): + self._ordered_dict = OrderedDict() + if values is not None: + self |= values + + # Required methods for MutableSet + + def __contains__(self, value): + return value in self._ordered_dict + + def __iter__(self): + return iter(self._ordered_dict) + + def __len__(self): + return len(self._ordered_dict) + + def add(self, value): + self._ordered_dict[value] = None + + def discard(self, value): + del self._ordered_dict[value] + + # Additional methods + + def update(self, values): + self |= values + + def __repr__(self): + return '%s(%r)' % (type(self).__name__, list(self)) + + class NdimSizeLenMixin(object): """Mixin class that extends a class that defines a ``shape`` property to one that also defines ``ndim``, ``size`` and ``__len__``. 
From a8f60baa4acf67233848f464ea82604d8e34292c Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Aug 2017 00:15:09 -0700 Subject: [PATCH 050/113] Support dask and numpy vindex with one path --- xarray/backends/scipy_.py | 4 - xarray/core/indexing.py | 143 ++++++++++++++--------------- xarray/core/variable.py | 168 +++++++++++++++++----------------- xarray/tests/test_variable.py | 98 +++++++------------- 4 files changed, 189 insertions(+), 224 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 1e875d0f858..5b5dcd461f9 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -46,10 +46,6 @@ def get_array(self): return self.datastore.ds.variables[self.variable_name].data def __getitem__(self, key): - if isinstance(key, OuterIndexer): - key = key.vectorize(self.shape) - - key = to_tuple(key) with self.datastore.ensure_open(autoclose=True): data = NumpyIndexingAdapter(self.get_array())[key] # Copy data if the source file is mmapped. diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 02e5bc5ff8f..2a75653a863 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from . import nputils from . import utils from .npcompat import moveaxis from .pycompat import (iteritems, range, integer_types, dask_array_type, @@ -283,32 +284,6 @@ class OuterIndexer(IndexerTuple): """ Tuple for outer/orthogonal indexing. All the item is one of integer, slice, and 1d-np.ndarray. """ - def vectorize(self, shape): - """ Convert to a vectorized indexer. - shape: shape of the array subject to the indexing. - """ - if len([k for k in self if not isinstance(k, slice)]) <= 1: - # if there is only one vector and all others are slice, - # it can be safely converted to vectorized indexer - # Boolean index should be converted to integer array. 
- return VectorizedIndexer(self) - else: - n_dim = len([k for k in self if not isinstance(k, integer_types)]) - i_dim = 0 - new_key = [] - for k, size in zip(self, shape): - if isinstance(k, integer_types): - new_key.append(k) - else: # np.ndarray or slice - if isinstance(k, slice): - k = np.arange(*k.indices(size)) - if k.dtype.kind == 'b': - (k, ) = k.nonzero() - shape = [(1,) * i_dim + (k.size, ) + - (1,) * (n_dim - i_dim - 1)] - new_key.append(k.reshape(*shape)) - i_dim += 1 - return VectorizedIndexer(new_key) class VectorizedIndexer(IndexerTuple): @@ -444,6 +419,44 @@ def xarray_indexable(array): return array +def _outer_to_numpy_indexer(key, shape): + """Convert an OuterIndexer into an indexer for NumPy. + + Parameters + ---------- + key : OuterIndexer + Outer indexing tuple to convert. + shape : tuple + Shape of the array subject to the indexing. + + Returns + ------- + tuple + Base tuple suitable for use to index a NumPy array. + """ + if len([k for k in key if not isinstance(k, slice)]) <= 1: + # If there is only one vector and all others are slice, + # it can be safely used in mixed basic/advanced indexing. + # Boolean index should already be converted to integer array. 
+ return tuple(key) + + n_dim = len([k for k in key if not isinstance(k, integer_types)]) + i_dim = 0 + new_key = [] + for k, size in zip(key, shape): + if isinstance(k, integer_types): + new_key.append(k) + else: # np.ndarray or slice + if isinstance(k, slice): + k = np.arange(*k.indices(size)) + assert k.dtype.kind == 'i' + shape = [(1,) * i_dim + (k.size, ) + + (1,) * (n_dim - i_dim - 1)] + new_key.append(k.reshape(*shape)) + i_dim += 1 + return tuple(new_key) + + class NumpyIndexingAdapter(utils.NDArrayMixin): """Wrap a NumPy array to use broadcasted indexing """ @@ -459,17 +472,24 @@ def _ensure_ndarray(self, value): value = utils.to_0d_array(value) return value - def __getitem__(self, key): + def _indexing_array_and_key(self, key): if isinstance(key, OuterIndexer): - key = key.vectorize(self.shape) - key = to_tuple(key) - return self._ensure_ndarray(self.array[key]) + key = _outer_to_numpy_indexer(key, self.array.shape) + + if isinstance(key, VectorizedIndexer): + array = nputils.VectorizedIndex(self.array) + else: + array = self.array + + return array, to_tuple(key) + + def __getitem__(self, key): + array, key = self._indexing_array_and_key(key) + return self._ensure_ndarray(array[key]) def __setitem__(self, key, value): - if isinstance(key, OuterIndexer): - key = key.vectorize(self.shape) - key = to_tuple(key) - self.array[key] = value + array, key = self._indexing_array_and_key(key) + array[key] = value class DaskIndexingAdapter(utils.NDArrayMixin): @@ -482,51 +502,28 @@ def __init__(self, array): self.array = array def __getitem__(self, key): - # should always get PointwiseIndexer instead - assert not isinstance(key, VectorizedIndexer) - - if isinstance(key, PointwiseIndexer): - return self._getitem_pointwise(key) - - try: - key = to_tuple(key) - return self.array[key] - except NotImplementedError: - # manual orthogonal indexing. 
- value = self.array - for axis, subkey in reversed(list(enumerate(key))): - value = value[(slice(None),) * axis + (subkey,)] - return value - - def _getitem_pointwise(self, key): - pointwise_shape, pointwise_index = next( - (k.shape, i) for i, k in enumerate(key) - if not isinstance(k, slice)) - # dask's indexing only handles 1d arrays - flat_key = tuple(k if isinstance(k, slice) else k.ravel() - for k in key) - - if len([k for k in key if not isinstance(k, slice)]) == 1: - # vindex requires more than one non-slice :( - # but we can use normal indexing instead - indexed = self.array[flat_key] - new_shape = (indexed.shape[:pointwise_index] + - pointwise_shape + - indexed.shape[pointwise_index + 1:]) - return indexed.reshape(new_shape) + if isinstance(key, BasicIndexer): + return self.array[tuple(key)] + elif isinstance(key, VectorizedIndexer): + return self.array.vindex[tuple(key)] else: - indexed = self.array.vindex[flat_key] - # vindex always moves slices to the end - reshaped = indexed.reshape(pointwise_shape + indexed.shape[1:]) - # reorder dimensions to match order of appearance - positions = np.arange(0, len(pointwise_shape)) - return moveaxis(reshaped, positions, positions + pointwise_index) + assert isinstance(key, OuterIndexer) + key = tuple(key) + try: + return self.array[key] + except NotImplementedError: + # manual orthogonal indexing. + # TODO: port this upstream into dask in a saner way. + value = self.array + for axis, subkey in reversed(list(enumerate(key))): + value = value[(slice(None),) * axis + (subkey,)] + return value def __setitem__(self, key, value): raise TypeError("this variable's data is stored in a dask array, " 'which does not support item assignment. 
To ' 'assign to this variable, you must first load it ' - 'into memory explicitly using the .load_data() ' + 'into memory explicitly using the .load() ' 'method or accessing its .values attribute.') diff --git a/xarray/core/variable.py b/xarray/core/variable.py index abdb630909e..875f31061d8 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -16,10 +16,12 @@ from . import nputils from . import ops from . import utils +from .npcompat import moveaxis from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) from .indexing import (PandasIndexAdapter, xarray_indexable, BasicIndexer, OuterIndexer, PointwiseIndexer, VectorizedIndexer) +from .utils import OrderedSet import xarray as xr # only for Dataset and DataArray @@ -388,19 +390,23 @@ def _item_key_to_tuple(self, key): return key def _broadcast_indexes(self, key): - """ + """Prepare an indexing key for an indexing operation. + Parameters ----------- - key: One of - array - a mapping of dimension names to index. + key: int, slice, array, dict or tuple of integer, slices and arrays + Any valid input for indexing. Returns ------- - dims: Tuple of strings. + dims: tuple Dimension of the resultant variable. - indexers: list of integer, array-like, or slice. This is aligned - along self.dims. + indexers: IndexingTuple subclass + Tuple of integer, array-like, or slices to use when indexing + self._data. The type of this argument indicates the type of + indexing to perform, either basic, outer or vectorized. + new_order : Optional[Sequence[int]] + Optional reordering to do on the result of indexing. 
""" key = self._item_key_to_tuple(key) # key is a tuple # key is a tuple of full size @@ -420,7 +426,7 @@ def _broadcast_indexes(self, key): for k, d in zip(key, self.dims): if isinstance(k, Variable): if len(k.dims) > 1: - return self._broadcast_indexes_advanced(key) + return self._broadcast_indexes_vectorized(key) dims.append(k.dims[0]) if not isinstance(k, integer_types): dims.append(d) @@ -428,12 +434,12 @@ def _broadcast_indexes(self, key): if len(set(dims)) == len(dims): return self._broadcast_indexes_outer(key) - return self._broadcast_indexes_advanced(key) + return self._broadcast_indexes_vectorized(key) def _broadcast_indexes_basic(self, key): dims = tuple(dim for k, dim in zip(key, self.dims) if not isinstance(k, integer_types)) - return dims, BasicIndexer(key) + return dims, BasicIndexer(key), None def _broadcast_indexes_outer(self, key): dims = tuple(k.dims[0] if isinstance(k, Variable) else dim @@ -453,7 +459,7 @@ def _broadcast_indexes_outer(self, key): "cannot be used for indexing: {}".format( k)) indexer.append(k if k.dtype.kind != 'b' else np.flatnonzero(k)) - return dims, OuterIndexer(indexer) + return dims, OuterIndexer(indexer), None def _nonzero(self): """ Equivalent numpy's nonzero but returns a tuple of Varibles. 
""" @@ -463,77 +469,61 @@ def _nonzero(self): return tuple(Variable((dim), nz) for nz, dim in zip(nonzeros, self.dims)) - def _broadcast_indexes_advanced(self, key): - if isinstance(self._data, dask_array_type): - # dask only supports a very restricted form of advanced indexing - return self._broadcast_indexes_dask_pointwise(key) + def _broadcast_indexes_vectorized(self, key): variables = [] + out_dims_set = OrderedSet() for dim, value in zip(self.dims, key): if isinstance(value, slice): - value = np.arange(*value.indices(self.sizes[dim])) - - try: - variable = as_variable(value, name=dim) - except MissingDimensionsError: # change to better exception - raise IndexError("Unlabelled multi-dimensional array " - "cannot be used for indexing.") - - if variable.dtype.kind == 'b': # boolean indexing case - if variable.ndim > 1: - raise IndexError("{}-dimensional boolean indexing is " - "not supported. ".format(variable.ndim)) - variables.extend(variable._nonzero()) + out_dims_set.add(dim) else: + try: + variable = as_variable(value, name=dim) + except MissingDimensionsError: # change to better exception + raise IndexError("Unlabelled multi-dimensional array " + "cannot be used for indexing.") + + if variable.dtype.kind == 'b': # boolean indexing case + if variable.ndim > 1: + raise IndexError("{}-dimensional boolean indexing is " + "not supported. ".format(variable.ndim)) + (variable,) = variable._nonzero() + variables.append(variable) + out_dims_set.update(variable.dims) + + variable_dims = set() + for variable in variables: + variable_dims.update(variable.dims) + + slices = [] + for i, (dim, value) in enumerate(zip(self.dims, key)): + if isinstance(value, slice): + if dim in variable_dims: + # We only convert slice objects to variables if they share + # a dimension with at least one other variable. Otherwise, + # we can equivalently leave them as slices and transpose the + # result. This is significantly faster/more efficient for + # most array backends. 
+ values = np.arange(*value.indices(self.sizes[dim])) + variables.insert(i, Variable((dim,), values)) + else: + slices.append((i, value)) + try: variables = _broadcast_compat_variables(*variables) except ValueError: raise IndexError("Dimensions of indexers mismatch: {}".format(key)) - dims = variables[0].dims # all variables have the same dims - # overwrite if there is integers - key = VectorizedIndexer(variable.data for variable in variables) - return dims, key - - def _broadcast_indexes_dask_pointwise(self, key): - if any(not isinstance(k, (Variable, slice)) for k in key): - raise IndexError( - 'Vectorized indexing with dask requires that all indexers are ' - 'labeled arrays or full slice objects: {}'.format(key)) - - if any(isinstance(k, Variable) and k.dtype.kind == 'b' for k in key): - raise IndexError( - 'Vectorized indexing with dask does not support booleans: {}' - .format(key)) - - dims_set = {k.dims for k in key if isinstance(k, Variable)} - if len(dims_set) != 1: - raise IndexError( - 'Vectorized indexing with dask requires that all labeled ' - 'arrays in the indexer have the same dimension names, but ' - 'arrays have different dimensions: {}'.format(key)) - (unique_dims,) = dims_set - - shapes_set = {k.shape for k in key if isinstance(k, Variable)} - if len(shapes_set) != 1: - # matches message in _broadcast_indexes_advanced - raise IndexError("Dimensions of indexers mismatch: {}".format(key)) - dims = [] - found_first_array = False - for k, d in zip(key, self.dims): - if isinstance(k, slice): - if d in unique_dims: - raise IndexError( - 'Labeled arrays used in vectorized indexing with dask ' - 'cannot reuse a sliced dimension: {}'.format(d)) - dims.append(d) - elif not found_first_array: - dims.extend(k.dims) - found_first_array = True + out_key = [variable.data for variable in variables] + out_dims = tuple(out_dims_set) + reorder = [] + for i, value in slices: + out_key.insert(i, value) + new_position = out_dims.index(self.dims[i]) + 
reorder.append(new_position) - key = PointwiseIndexer(getattr(k, 'data', k) for k in key) - return tuple(dims), key + return out_dims, VectorizedIndexer(out_key), reorder def __getitem__(self, key): """Return a new Array object whose contents are consistent with @@ -549,12 +539,11 @@ def __getitem__(self, key): If you really want to do indexing like `x[x > 0]`, manipulate the numpy array `x.values` directly. """ - dims, index_tuple = self._broadcast_indexes(key) + dims, index_tuple, new_order = self._broadcast_indexes(key) data = self._indexable_data[index_tuple] - if hasattr(data, 'ndim'): - assert data.ndim == len(dims), (data.ndim, len(dims)) - else: - assert len(dims) == 0, len(dims) + if new_order: + data = moveaxis(data, -(1 + np.arange(len(new_order))), new_order) + assert getattr(data, 'ndim', 0) == len(dims), (data.ndim, len(dims)) return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) @@ -564,12 +553,23 @@ def __setitem__(self, key, value): See __getitem__ for more details. 
""" - dims, index_tuple = self._broadcast_indexes(key) - data = xarray_indexable(self._data) + dims, index_tuple, new_order = self._broadcast_indexes(key) + if isinstance(value, Variable): - data[index_tuple] = value.set_dims(dims) - else: - data[index_tuple] = value + value = value.set_dims(dims).data + + if new_order: + value = duck_array_ops.asarray(value) + if value.ndim > len(dims): + raise ValueError( + 'shape mismatch: value array of shape %s could not be' + 'broadcast to indexing result with %s dimensions' + % (value.shape, len(dims))) + + value = value[(len(dims) - value.ndim) * (np.newaxis,) + (Ellipsis,)] + value = moveaxis(value, new_order, -(1 + np.arange(len(new_order)))) + + self._indexable_data[index_tuple] = value @property def attrs(self): @@ -910,8 +910,8 @@ def set_dims(self, dims, shape=None): missing_dims = set(self.dims) - set(dims) if missing_dims: - raise ValueError('new dimensions must be a superset of existing ' - 'dimensions') + raise ValueError('new dimensions %r must be a superset of existing ' + 'dimensions %r' % (dims, self.dims)) self_dims = set(self.dims) expanded_dims = tuple( @@ -1370,12 +1370,12 @@ def chunk(self, chunks=None, name=None, lock=False): return self.copy(deep=False) def __getitem__(self, key): - dims, index_tuple = self._broadcast_indexes(key) + dims, index_tuple, _ = self._broadcast_indexes(key) if len(dims) > 1: raise IndexError('Multiple dimension array cannot be used for ' 'indexing IndexVariable: {}'.format(key)) values = self._indexable_data[index_tuple] - if not hasattr(values, 'ndim') or values.ndim == 0: + if getattr(values, 'ndim', 0) == 0: return Variable((), values, self._attrs, self._encoding) else: return type(self)(dims, values, self._attrs, diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index c9896623c1f..085bb70e2dc 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -524,14 +524,13 @@ def test_getitem_advanced(self): 
self.assertArrayEqual(v_new, v_data[:, 0:1]) def test_getitem_fancy(self): - # dask-based Variable objects don't support this full set of operations. v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) v_data = v.compute().data ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) v_new = v[ind] assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, v_data[([0, 1, 1], [1, 1, 0]), :]) + self.assertArrayEqual(v_new, v_data[[[0, 1, 1], [1, 1, 0]], :]) ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) v_new = v[dict(y=ind)] @@ -572,6 +571,12 @@ def test_getitem_fancy(self): assert v_new.dims == ('a', 'b', 'y') self.assertArrayEqual(v_new, v_data[[[0, 0], [1, 1]], slice(None, 1)]) + # slice matches explicit dimension + ind = Variable(['y'], [0, 1]) + v_new = v[ind, :2] + assert v_new.dims == ('y',) + self.assertArrayEqual(v_new, v_data[[0, 1], [0, 1]]) + def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) @@ -584,10 +589,7 @@ def test_getitem_error(self): v[ind_x, ind_y] ind = Variable(['a', 'b'], [[True, False], [False, True]]) - msg = ('dask does not support booleans' - if type(self) is TestVariable_withDask - else '2-dimensional boolean') - with self.assertRaisesRegexp(IndexError, msg): + with self.assertRaisesRegexp(IndexError, '2-dimensional boolean'): v[dict(x=ind)] @@ -804,38 +806,38 @@ def test_detect_indexer_type(self): data = np.random.random((10, 11)) v = Variable(['x', 'y'], data) - _, ind = v._broadcast_indexes((0, 1)) + _, ind, _ = v._broadcast_indexes((0, 1)) assert type(ind) == indexing.BasicIndexer - _, ind = v._broadcast_indexes((0, slice(0, 8, 2))) + _, ind, _ = v._broadcast_indexes((0, slice(0, 8, 2))) assert type(ind) == indexing.BasicIndexer - _, ind = v._broadcast_indexes((0, [0, 1])) + _, ind, _ = v._broadcast_indexes((0, [0, 1])) assert type(ind) == indexing.OuterIndexer - _, ind = v._broadcast_indexes(([0, 1], 1)) + _, ind, _ = v._broadcast_indexes(([0, 1], 1)) assert type(ind) == indexing.OuterIndexer - _, 
ind = v._broadcast_indexes(([0, 1], [1, 2])) + _, ind, _ = v._broadcast_indexes(([0, 1], [1, 2])) assert type(ind) == indexing.OuterIndexer - _, ind = v._broadcast_indexes(([0, 1], slice(0, 8, 2))) + _, ind, _ = v._broadcast_indexes(([0, 1], slice(0, 8, 2))) assert type(ind) == indexing.OuterIndexer vind = Variable(('a', ), [0, 1]) - _, ind = v._broadcast_indexes((vind, slice(0, 8, 2))) + _, ind, _ = v._broadcast_indexes((vind, slice(0, 8, 2))) assert type(ind) == indexing.OuterIndexer vind = Variable(('y', ), [0, 1]) - _, ind = v._broadcast_indexes((vind, 3)) + _, ind, _ = v._broadcast_indexes((vind, 3)) assert type(ind) == indexing.OuterIndexer vind = Variable(('a', ), [0, 1]) - _, ind = v._broadcast_indexes((vind, vind)) + _, ind, _ = v._broadcast_indexes((vind, vind)) assert type(ind) == indexing.VectorizedIndexer vind = Variable(('a', 'b'), [[0, 2], [1, 3]]) - _, ind = v._broadcast_indexes((vind, 3)) + _, ind, _ = v._broadcast_indexes((vind, 3)) assert type(ind) == indexing.VectorizedIndexer def test_items(self): @@ -1291,6 +1293,15 @@ def test_setitem(self): self.assertTrue(v[0, 1] == 1) # dimension broadcast + v = Variable(['x', 'y'], np.ones((3, 2))) + ind = Variable(['a', 'b'], [[0, 1]]) + v[ind, :] = 0 + expected = Variable(['x', 'y'], [[0, 0], [0, 0], [1, 1]]) + self.assertVariableIdentical(expected, v) + + with self.assertRaisesRegexp(ValueError, "shape mismatch"): + v[ind, ind] = np.zeros((1, 2, 1)) + v = Variable(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) ind = Variable(['a'], [0, 1]) v[dict(x=ind)] = Variable(['a', 'y'], np.ones((2, 3), dtype=int) * 10) @@ -1299,69 +1310,30 @@ def test_setitem(self): @requires_dask -class TestVariable_withDask(TestCase, VariableSubclassTestCases): +class TestVariableWithDask(TestCase, VariableSubclassTestCases): cls = staticmethod(lambda *args: Variable(*args).chunk()) @pytest.mark.xfail def test_0d_object_array_with_list(self): - super(TestVariable_withDask, self).test_0d_object_array_with_list() + 
super(TestVariableWithDask, self).test_0d_object_array_with_list() @pytest.mark.xfail def test_array_interface(self): # dask array does not have `argsort` - super(TestVariable_withDask, self).test_array_interface() + super(TestVariableWithDask, self).test_array_interface() @pytest.mark.xfail def test_copy_index(self): - super(TestVariable_withDask, self).test_copy_index() + super(TestVariableWithDask, self).test_copy_index() @pytest.mark.xfail def test_eq_all_dtypes(self): - super(TestVariable_withDask, self).test_eq_all_dtypes() - - @pytest.mark.xfail - def test_getitem_fancy(self): - super(TestVariable_withDask, self).test_getitem_fancy() + super(TestVariableWithDask, self).test_eq_all_dtypes() def test_getitem_fancy(self): - # selectively copied from the superclass. - v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) - v_data = v.compute().data - - ind = Variable(['a', 'b'], [[0, 1, 1], [1, 1, 0]]) - v_new = v[ind] - assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, v_data[([0, 1, 1], [1, 1, 0]), :]) - - ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) - v_new = v[dict(y=ind)] - assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v_data[:, ([0, 1, 2], [2, 1, 0])]) - - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - with self.assertRaisesRegexp(IndexError, 'same dimension names'): - v_new = v[dict(x=ind[:, 0], y=ind)] - - # along diagonal - ind = Variable(['a'], [0, 1]) - v_new = v[ind, ind] - assert v_new.dims == ('a',) - self.assertArrayEqual(v_new, v_data[[0, 1], [0, 1]]) - - # with slice - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v[dict(x=slice(None), y=ind)] - assert v_new.dims == ('x', 'a', 'b') - self.assertArrayEqual(v_new, v_data[:, [[0, 0], [1, 1]]]) - - ind = Variable(['a', 'b'], [[0, 0], [1, 1]]) - v_new = v[dict(x=ind, y=slice(None))] - assert v_new.dims == ('a', 'b', 'y') - self.assertArrayEqual(v_new, v_data[[[0, 0], [1, 1]], :]) - - ind = Variable(['x', 'b'], [[0, 0], [1, 1]]) - with 
self.assertRaisesRegexp(IndexError, 'reuse a sliced dimension'): - v_new = v[dict(x=slice(None), y=ind)] + if LooseVersion(dask.__version__) <= LooseVersion('0.15.1'): + pytest.xfail("vindex from latest dask is required") + super(TestVariableWithDask, self).test_getitem_fancy() class TestIndexVariable(TestCase, VariableSubclassTestCases): From 69f85705b9a25e3789bfc6e228e9dd9cb48a1819 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Aug 2017 00:50:40 -0700 Subject: [PATCH 051/113] Fix test failures --- xarray/core/variable.py | 20 ++++++++++++++------ xarray/tests/test_indexing.py | 29 +++++++++++++++++++---------- xarray/tests/test_variable.py | 8 ++++++++ 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 875f31061d8..e84b9461733 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -406,7 +406,9 @@ def _broadcast_indexes(self, key): self._data. The type of this argument indicates the type of indexing to perform, either basic, outer or vectorized. new_order : Optional[Sequence[int]] - Optional reordering to do on the result of indexing. + Optional reordering to do on the result of indexing. If not None, + the first len(new_order) indexing should be moved to these + positions. 
""" key = self._item_key_to_tuple(key) # key is a tuple # key is a tuple of full size @@ -517,13 +519,19 @@ def _broadcast_indexes_vectorized(self, key): out_key = [variable.data for variable in variables] out_dims = tuple(out_dims_set) - reorder = [] + slice_positions = set() for i, value in slices: out_key.insert(i, value) new_position = out_dims.index(self.dims[i]) - reorder.append(new_position) + slice_positions.add(new_position) - return out_dims, VectorizedIndexer(out_key), reorder + if slice_positions: + new_order = [i for i in range(len(out_dims)) + if i not in slice_positions] + else: + new_order = None + + return out_dims, VectorizedIndexer(out_key), new_order def __getitem__(self, key): """Return a new Array object whose contents are consistent with @@ -542,7 +550,7 @@ def __getitem__(self, key): dims, index_tuple, new_order = self._broadcast_indexes(key) data = self._indexable_data[index_tuple] if new_order: - data = moveaxis(data, -(1 + np.arange(len(new_order))), new_order) + data = moveaxis(data, range(len(new_order)), new_order) assert getattr(data, 'ndim', 0) == len(dims), (data.ndim, len(dims)) return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) @@ -567,7 +575,7 @@ def __setitem__(self, key, value): % (value.shape, len(dims))) value = value[(len(dims) - value.ndim) * (np.newaxis,) + (Ellipsis,)] - value = moveaxis(value, new_order, -(1 + np.arange(len(new_order)))) + value = moveaxis(value, new_order, range(len(new_order))) self._indexable_data[index_tuple] = value diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 249b745e6e9..ed80ca7aad7 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -1,11 +1,15 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + import numpy as np import pandas as pd from xarray import Dataset, DataArray, Variable from xarray.core import indexing +from xarray.core 
import nputils +from xarray.core.npcompat import moveaxis from . import TestCase, ReturnItem @@ -224,8 +228,8 @@ def test_index_scalar(self): class TestIndexerTuple(TestCase): - """ Make sure OuterIndexer.vectorize gives similar result to - v._broadcast_indexes_advanced + """ Make sure _outer_to_numpy_indexer gives similar result to + Variable._broadcast_indexes_vectorized """ def test_outer_indexer(self): def nonzero(x): @@ -238,11 +242,16 @@ def nonzero(x): # test orthogonally applied indexers indexers = [I[:], 0, -2, I[:3], np.array([0, 1, 2, 3]), np.array([0]), np.arange(10) < 5] - for i in indexers: - for j in indexers: - for k in indexers: - outer_index = indexing.OuterIndexer( - (nonzero(i), nonzero(j), nonzero(k))) - _, expected = v._broadcast_indexes_advanced((i, j, k)) - actual = outer_index.vectorize(v.shape) - self.assertArrayEqual(v.data[actual], v.data[expected]) + for i, j, k in itertools.product(indexers, repeat=3): + + _, expected, new_order = v._broadcast_indexes_vectorized((i, j, k)) + expected_data = nputils.VectorizedIndex(v.data)[expected] + if new_order: + old_order = range(len(new_order)) + expected_data = moveaxis(expected_data, old_order, new_order) + + outer_index = indexing.OuterIndexer( + (nonzero(i), nonzero(j), nonzero(k))) + actual = indexing._outer_to_numpy_indexer(outer_index, v.shape) + actual_data = v.data[actual] + self.assertArrayEqual(actual_data, expected_data) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 085bb70e2dc..f0fdc71b483 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -577,6 +577,13 @@ def test_getitem_fancy(self): assert v_new.dims == ('y',) self.assertArrayEqual(v_new, v_data[[0, 1], [0, 1]]) + # with multiple slices + v = self.cls(['x', 'y', 'z'], [[[1, 2, 3], [4, 5, 6]]]) + ind = Variable(['a', 'b'], [[0]]) + v_new = v[ind, :, :] + expected = Variable(['a', 'b', 'y', 'z'], v.data[np.newaxis, ...]) + self.assertVariableIdentical(v_new, expected) 
+ def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) @@ -1331,6 +1338,7 @@ def test_eq_all_dtypes(self): super(TestVariableWithDask, self).test_eq_all_dtypes() def test_getitem_fancy(self): + import dask if LooseVersion(dask.__version__) <= LooseVersion('0.15.1'): pytest.xfail("vindex from latest dask is required") super(TestVariableWithDask, self).test_getitem_fancy() From 5eb00b7866dcffe8662dbd847f3f00a43b9369a7 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 30 Jul 2017 23:13:12 +0900 Subject: [PATCH 052/113] working with `Dataset.sel` --- xarray/core/dataset.py | 131 ++++++++++++----------------------- xarray/core/indexing.py | 17 ++++- xarray/tests/test_dataset.py | 92 ++++++++++++++---------- 3 files changed, 114 insertions(+), 126 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 48e5a5bc2aa..3f9241c831b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1220,6 +1220,26 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): Dataset.isel_points DataArray.sel """ + from .dataarray import DataArray + + for k, v in indexers.items(): + if isinstance(v, tuple): + if (k in self.indexes and + (isinstance(self.indexes[k].data, pd.MultiIndex) or + self.indexes[k].dtype == 'object')): + # If array dtype is tuple, we should be carefully check + # whether indexer should be Variable or not. 
+ try: + v_tmp = as_variable(v) + # TODO should check dtype consistency + indexers[k] = v_tmp + except (ValueError, TypeError): + pass + else: + indexers[k] = as_variable(v) + elif isinstance(v, DataArray): + indexers[k] = v.variable + pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) @@ -1265,98 +1285,29 @@ def isel_points(self, dim='points', **indexers): DataArray.isel_points """ - indexer_dims = set(indexers) - - def take(variable, slices): - # Note: remove helper function when once when numpy - # supports vindex https://github.com/numpy/numpy/pull/6075 - if hasattr(variable.data, 'vindex'): - # Special case for dask backed arrays to use vectorised list indexing - sel = variable.data.vindex[slices] - else: - # Otherwise assume backend is numpy array with 'fancy' indexing - sel = variable.data[slices] - return sel - - def relevant_keys(mapping): - return [k for k, v in mapping.items() - if any(d in indexer_dims for d in v.dims)] - - coords = relevant_keys(self.coords) - indexers = [(k, np.asarray(v)) for k, v in iteritems(indexers)] - indexers_dict = dict(indexers) - non_indexed_dims = set(self.dims) - indexer_dims - non_indexed_coords = set(self.coords) - set(coords) - - # All the indexers should be iterables - # Check that indexers are valid dims, integers, and 1D - for k, v in indexers: - if k not in self.dims: - raise ValueError("dimension %s does not exist" % k) - if v.dtype.kind != 'i': - raise TypeError('Indexers must be integers') - if v.ndim != 1: - raise ValueError('Indexers must be 1 dimensional') - - # all the indexers should have the same length - lengths = set(len(v) for k, v in indexers) - if len(lengths) > 1: - raise ValueError('All indexers must be the same length') - - # Existing dimensions are not valid choices for the dim argument - if isinstance(dim, basestring): - if dim in self.dims: - # dim is an invalid string - raise ValueError('Existing dimension names are not valid ' - 
'choices for the dim argument in sel_points') - - elif hasattr(dim, 'dims'): - # dim is a DataArray or Coordinate - if dim.name in self.dims: - # dim already exists - raise ValueError('Existing dimensions are not valid choices ' - 'for the dim argument in sel_points') - - # Set the new dim_name, and optionally the new dim coordinate - # dim is either an array-like or a string - if not utils.is_scalar(dim): - # dim is array like get name or assign 'points', get as variable - dim_name = 'points' if not hasattr(dim, 'name') else dim.name - dim_coord = as_variable(dim, name=dim_name) - else: - # dim is a string - dim_name = dim - dim_coord = None - - reordered = self.transpose(*(list(indexer_dims) + list(non_indexed_dims))) + import warnings + warnings.warn('Dataset.isel_points is deprecated: use Dataset.isel()' + 'instead', DeprecationWarning, stacklevel=2) - variables = OrderedDict() + from .dataarray import DataArray + if isinstance(dim, DataArray): + indexers = {k: DataArray(v, dims=[dim.name], coords=dim.coords) + for k, v in iteritems(indexers)} + return self.isel(**indexers) - for name, var in reordered.variables.items(): - if name in indexers_dict or any(d in indexer_dims for d in var.dims): - # slice if var is an indexer or depends on an indexed dim - slc = [indexers_dict[k] - if k in indexers_dict - else slice(None) for k in var.dims] - - var_dims = [dim_name] + [d for d in var.dims - if d in non_indexed_dims] - selection = take(var, tuple(slc)) - var_subset = type(var)(var_dims, selection, var.attrs) - variables[name] = var_subset - else: - # If not indexed just add it back to variables or coordinates - variables[name] = var + if isinstance(dim, (list, np.ndarray)): + indexers = {k: DataArray(v, dims=['points'], + coords={'points': dim}) + for k, v in iteritems(indexers)} + return self.isel(**indexers) - coord_names = (set(coords) & set(variables)) | non_indexed_coords + if isinstance(dim, pd.Index): + indexers = {k: DataArray(v, dims=[dim.name, ], + 
coords={dim.name: dim}) + for k, v in iteritems(indexers)} + return self.isel(**indexers) - dset = self._replace_vars_and_dims(variables, coord_names=coord_names) - # Add the dim coord to the new dset. Must be done after creation - # because_replace_vars_and_dims can only access existing coords, - # not add new ones - if dim_coord is not None: - dset.coords[dim_name] = dim_coord - return dset + return self.isel(**{k: ((dim, ), v) for k, v in iteritems(indexers)}) def sel_points(self, dim='points', method=None, tolerance=None, **indexers): @@ -1410,6 +1361,10 @@ def sel_points(self, dim='points', method=None, tolerance=None, Dataset.isel_points DataArray.sel_points """ + import warnings + warnings.warn('Dataset.sel_points is deprecated: use Dataset.sel()' + 'instead', DeprecationWarning, stacklevel=2) + pos_indexers, _ = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 02e5bc5ff8f..d6889e9a2fc 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -95,7 +95,20 @@ def get_loc(index, label, method=None, tolerance=None): def get_indexer(index, labels, method=None, tolerance=None): + """ Call pd.Index.get_indexer(labels). If labels are Variable, + The return type is also a Variable with the same dimension to + labels. 
+ """ + from .variable import Variable + kwargs = _index_method_kwargs(method, tolerance) + if isinstance(labels, Variable): + if labels.ndim > 1: + indexers = np.array(index.get_indexer(labels.data.flatten(), + **kwargs)) + return Variable(labels.dims, indexers.reshape(labels.shape)) + else: + return Variable(labels.dims, index.get_indexer(labels, **kwargs)) return index.get_indexer(labels, **kwargs) @@ -145,14 +158,14 @@ def convert_label_indexer(index, label, index_name='', method=None, ) else: - label = _asarray_tuplesafe(label) + label = label if hasattr(label, 'dims') else _asarray_tuplesafe(label) if label.ndim == 0: if isinstance(index, pd.MultiIndex): indexer, new_index = index.get_loc_level(label.item(), level=0) else: indexer = get_loc(index, label.item(), method, tolerance) elif label.dtype.kind == 'b': - indexer, = np.nonzero(label) + indexer = label else: indexer = get_indexer(index, label, method, tolerance) if np.any(indexer < 0): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 17836c0c7ca..7e7f4eaa9d9 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -853,47 +853,49 @@ def test_isel_fancy(self): pdim1 = [1, 2, 3] pdim2 = [4, 5, 1] pdim3 = [1, 2, 3] - actual = data.isel(dim1=(('test_coord'), pdim1), - dim2=(('test_coord'), pdim2), - dim3=(('test_coord'), pdim3)) + actual = data.isel(dim1=(('test_coord', ), pdim1), + dim2=(('test_coord', ), pdim2), + dim3=(('test_coord', ), pdim3)) assert 'test_coord' in actual.dims assert actual.coords['test_coord'].shape == (len(pdim1), ) # Should work with DataArray actual = data.isel(dim1=DataArray(pdim1, dims='test_coord'), - dim2=(('test_coord'), pdim2), - dim3=(('test_coord'), pdim3)) + dim2=(('test_coord', ), pdim2), + dim3=(('test_coord', ), pdim3)) assert 'test_coord' in actual.dims assert actual.coords['test_coord'].shape == (len(pdim1), ) - actual = data.isel(dim1=(('points'), pdim1), dim2=(('points'), pdim2)) + actual = 
data.isel(dim1=(('points', ), pdim1), + dim2=(('points', ), pdim2)) assert 'points' in actual.dims assert 'dim3' in actual.dims assert 'dim3' not in actual.data_vars np.testing.assert_array_equal(data['dim2'][pdim2], actual['dim2']) # test that the order of the indexers doesn't matter - self.assertDatasetIdentical(data.isel(dim1=(('points'), pdim1), - dim2=(('points'), pdim2)), - data.isel(dim2=(('points'), pdim2), - dim1=(('points'), pdim1))) + self.assertDatasetIdentical(data.isel(dim1=(('points', ), pdim1), + dim2=(('points', ), pdim2)), + data.isel(dim2=(('points', ), pdim2), + dim1=(('points', ), pdim1))) # make sure we're raising errors in the right places with self.assertRaisesRegexp(IndexError, 'Dimensions of indexers mismatch'): - data.isel(dim1=(('points'), [1, 2]), dim2=(('points'), [1, 2, 3])) + data.isel(dim1=(('points', ), [1, 2]), + dim2=(('points', ), [1, 2, 3])) with self.assertRaisesRegexp(TypeError, 'cannot use a Dataset'): data.isel(dim1=Dataset({'points': [1, 2]})) # test to be sure we keep around variables that were not indexed ds = Dataset({'x': [1, 2, 3, 4], 'y': 0}) - actual = ds.isel(x=(('points'), [0, 1, 2])) + actual = ds.isel(x=(('points', ), [0, 1, 2])) self.assertDataArrayIdentical(ds['y'], actual['y']) # tests using index or DataArray as a dim stations = Dataset() - stations['station'] = ('station', ['A', 'B', 'C']) - stations['dim1s'] = ('station', [1, 2, 3]) - stations['dim2s'] = ('station', [4, 5, 1]) + stations['station'] = (('station', ), ['A', 'B', 'C']) + stations['dim1s'] = (('station', ), [1, 2, 3]) + stations['dim2s'] = (('station', ), [4, 5, 1]) actual = data.isel(dim1=stations['dim1s'], dim2=stations['dim2s']) @@ -910,8 +912,8 @@ def test_isel_fancy(self): # multi-dimensional selection stations = Dataset() - stations['a'] = ('a', ['A', 'B', 'C']) - stations['b'] = ('b', [0, 1]) + stations['a'] = (('a', ), ['A', 'B', 'C']) + stations['b'] = (('b', ), [0, 1]) stations['dim1s'] = (('a', 'b'), [[1, 2], [2, 3], [3, 4]]) 
stations['dim2s'] = (('a', ), [4, 5, 1]) @@ -1015,22 +1017,9 @@ def test_isel_points(self): data.isel_points(dim2=pdim2, dim1=pdim1)) # make sure we're raising errors in the right places - with self.assertRaisesRegexp(ValueError, - 'All indexers must be the same length'): + with self.assertRaisesRegexp(IndexError, + 'Dimensions of indexers mismatch'): data.isel_points(dim1=[1, 2], dim2=[1, 2, 3]) - with self.assertRaisesRegexp(ValueError, - 'dimension bad_key does not exist'): - data.isel_points(bad_key=[1, 2]) - with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): - data.isel_points(dim1=[1.5, 2.2]) - with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): - data.isel_points(dim1=[1, 2, 3], dim2=slice(3)) - with self.assertRaisesRegexp(ValueError, - 'Indexers must be 1 dimensional'): - data.isel_points(dim1=1, dim2=2) - with self.assertRaisesRegexp(ValueError, - 'Existing dimension names are not valid'): - data.isel_points(dim1=[1, 2], dim2=[1, 2], dim='dim2') # test to be sure we keep around variables that were not indexed ds = Dataset({'x': [1, 2, 3, 4], 'y': 0}) @@ -1085,15 +1074,46 @@ def test_sel_points(self): self.assertDatasetIdentical(expected, actual) data = Dataset({'foo': (('x', 'y'), np.arange(9).reshape(3, 3))}) - expected = Dataset({'foo': ('points', [0, 4, 8])} - ) + expected = Dataset({'foo': ('points', [0, 4, 8])}) + actual = data.sel_points(x=[0, 1, 2], y=[0, 1, 2]) + self.assertDatasetIdentical(expected, actual) + + data.coords.update({'x': [0, 1, 2], 'y': [0, 1, 2]}) + expected.coords.update({'x': ('points', [0, 1, 2]), + 'y': ('points', [0, 1, 2])}) + actual = data.sel_points(x=[0.1, 1.1, 2.5], y=[0, 1.2, 2.0], + method='pad') + self.assertDatasetIdentical(expected, actual) + + if pd.__version__ >= '0.17': + with self.assertRaises(KeyError): + data.sel_points(x=[2.5], y=[2.0], method='pad', tolerance=1e-3) + + def test_sel_fancy(self): + data = create_test_data() + + # add in a range() index + data['dim1'] = 
data.dim1 + + pdim1 = [1, 2, 3] + pdim2 = [4, 5, 1] + pdim3 = [1, 2, 3] + expected = data.isel(dim1=(('test_coord', ), pdim1), + dim2=(('test_coord', ), pdim2), + dim3=(('test_coord'), pdim3)) + actual = data.sel(dim1=(('test_coord', ), data.dim1[pdim1]), + dim2=(('test_coord', ), data.dim2[pdim2]), + dim3=(('test_coord', ), data.dim3[pdim3])) + self.assertDatasetIdentical(expected, actual) + + data = Dataset({'foo': (('x', 'y'), np.arange(9).reshape(3, 3))}) + expected = Dataset({'foo': ('points', [0, 4, 8])}) actual = data.sel_points(x=[0, 1, 2], y=[0, 1, 2]) self.assertDatasetIdentical(expected, actual) data.coords.update({'x': [0, 1, 2], 'y': [0, 1, 2]}) expected.coords.update({'x': ('points', [0, 1, 2]), - 'y': ('points', [0, 1, 2]) - }) + 'y': ('points', [0, 1, 2])}) actual = data.sel_points(x=[0.1, 1.1, 2.5], y=[0, 1.2, 2.0], method='pad') self.assertDatasetIdentical(expected, actual) From d133766446eeab5e5e997b824bb079bc5c374b0f Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 6 Aug 2017 20:30:01 +0900 Subject: [PATCH 053/113] Added more tests --- xarray/core/dataset.py | 7 +++ xarray/tests/test_dataarray.py | 101 +++++++++++++++++++++++++------- xarray/tests/test_dataset.py | 102 ++++++++++++++++++++++++++++++--- 3 files changed, 180 insertions(+), 30 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3f9241c831b..8fbc971d266 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1222,6 +1222,9 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): """ from .dataarray import DataArray + new_coords = {k: v._coords for k, v in indexers.items() + if isinstance(v, DataArray)} + for k, v in indexers.items(): if isinstance(v, tuple): if (k in self.indexes and @@ -1243,6 +1246,10 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) + # attach indexer's coordinate to 
pos_indexers + for k, v in new_coords.items(): + pos_indexers[k] = DataArray(pos_indexers[k], dims=v.keys(), + coords=v) result = self.isel(drop=drop, **pos_indexers) return result._replace_indexes(new_indexes) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 08a65fefbcb..1094c56f6f4 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -507,6 +507,81 @@ def test_isel(self): self.assertDataArrayIdentical(self.dv[:3, :5], self.dv.isel(x=slice(3), y=slice(5))) + def test_isel_fancy(self): + shape = (10, 7, 6) + np_array = np.random.random(shape) + da = DataArray(np_array, dims=['time', 'y', 'x'], + coords={'time': np.arange(0, 100, 10)}) + y = [1, 3] + x = [3, 0] + + expected = da.values[:, y, x] + + actual = da.isel(y=(('test_coord', ), y), x=(('test_coord', ), x)) + assert actual.coords['test_coord'].shape == (len(y), ) + assert list(actual.coords) == ['time'] + assert actual.dims == ('time', 'test_coord') + + np.testing.assert_equal(actual, expected) + + # a few corner cases + da.isel(time=(('points',), [1, 2]), x=(('points',), [2, 2]), + y=(('points',), [3, 4])) + np.testing.assert_allclose( + da.isel_points(time=[1], x=[2], y=[4]).values.squeeze(), + np_array[1, 4, 2].squeeze()) + da.isel(time=(('points', ), [1, 2])) + y = [-1, 0] + x = [-2, 2] + expected = da.values[:, y, x] + actual = da.isel(x=(('points', ), x), y=(('points', ), y)).values + np.testing.assert_equal(actual, expected) + + # test that the order of the indexers doesn't matter + self.assertDataArrayIdentical( + da.isel(y=(('points', ), y), x=(('points', ), x)), + da.isel(x=(('points', ), x), y=(('points', ), y))) + + # make sure we're raising errors in the right places + with self.assertRaisesRegexp(IndexError, + 'Dimensions of indexers mismatch'): + da.isel(y=(('points', ), [1, 2]), x=(('points', ), [1, 2, 3])) + + # tests using index or DataArray as indexers + stations = Dataset() + stations['station'] = (('station', ), ['A', 'B', 
'C']) + stations['dim1s'] = (('station', ), [1, 2, 3]) + stations['dim2s'] = (('station', ), [4, 5, 1]) + + actual = da.isel(x=stations['dim1s'], y=stations['dim2s']) + assert 'station' in actual.coords + assert 'station' in actual.dims + self.assertDataArrayIdentical(actual['station'], stations['station']) + + with self.assertRaisesRegexp(ValueError, 'conflicting values for '): + da.isel(x=DataArray([0, 1, 2], dims='station', + coords={'station': [0, 1, 2]}), + y=DataArray([0, 1, 2], dims='station', + coords={'station': [0, 1, 3]})) + + # multi-dimensional selection + stations = Dataset() + stations['a'] = (('a', ), ['A', 'B', 'C']) + stations['b'] = (('b', ), [0, 1]) + stations['dim1s'] = (('a', 'b'), [[1, 2], [2, 3], [3, 4]]) + stations['dim2s'] = (('a', ), [4, 5, 1]) + + actual = da.isel(x=stations['dim1s'], y=stations['dim2s']) + assert 'a' in actual.coords + assert 'a' in actual.dims + assert 'b' in actual.coords + assert 'b' in actual.dims + self.assertDataArrayIdentical(actual['a'], stations['a']) + self.assertDataArrayIdentical(actual['b'], stations['b']) + expected = da.variable[:, stations['dim2s'].variable, + stations['dim1s'].variable] + self.assertArrayEqual(actual, expected) + def test_sel(self): self.ds['x'] = ('x', np.array(list('abcdefghij'))) da = self.ds['foo'] @@ -582,14 +657,11 @@ def test_isel_points(self): actual = da.isel_points(y=y, x=x, dim='test_coord') assert actual.coords['test_coord'].shape == (len(y), ) assert list(actual.coords) == ['time'] - assert actual.dims == ('test_coord', 'time') + assert actual.dims == ('time', 'test_coord') actual = da.isel_points(y=y, x=x) assert 'points' in actual.dims - # Note that because xarray always concatenates along the first - # dimension, We must transpose the result to match the numpy style of - # concatenation. 
- np.testing.assert_equal(actual.T, expected) + np.testing.assert_equal(actual, expected) # a few corner cases da.isel_points(time=[1, 2], x=[2, 2], y=[3, 4]) @@ -601,7 +673,7 @@ def test_isel_points(self): x = [-2, 2] expected = da.values[:, y, x] actual = da.isel_points(x=x, y=y).values - np.testing.assert_equal(actual.T, expected) + np.testing.assert_equal(actual, expected) # test that the order of the indexers doesn't matter self.assertDataArrayIdentical( @@ -609,22 +681,9 @@ def test_isel_points(self): da.isel_points(x=x, y=y)) # make sure we're raising errors in the right places - with self.assertRaisesRegexp(ValueError, - 'All indexers must be the same length'): + with self.assertRaisesRegexp(IndexError, + 'Dimensions of indexers mismatch'): da.isel_points(y=[1, 2], x=[1, 2, 3]) - with self.assertRaisesRegexp(ValueError, - 'dimension bad_key does not exist'): - da.isel_points(bad_key=[1, 2]) - with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): - da.isel_points(y=[1.5, 2.2]) - with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): - da.isel_points(x=[1, 2, 3], y=slice(3)) - with self.assertRaisesRegexp(ValueError, - 'Indexers must be 1 dimensional'): - da.isel_points(y=1, x=2) - with self.assertRaisesRegexp(ValueError, - 'Existing dimension names are not'): - da.isel_points(y=[1, 2], x=[1, 2], dim='x') # using non string dims actual = da.isel_points(y=[1, 2], x=[1, 2], dim=['A', 'B']) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7e7f4eaa9d9..2bc2070560e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -865,6 +865,44 @@ def test_isel_fancy(self): dim3=(('test_coord', ), pdim3)) assert 'test_coord' in actual.dims assert actual.coords['test_coord'].shape == (len(pdim1), ) + expected = data.isel(dim1=(('test_coord', ), pdim1), + dim2=(('test_coord', ), pdim2), + dim3=(('test_coord', ), pdim3)) + self.assertDatasetIdentical(actual, expected) + + # DataArray with 
coordinate + idx1 = DataArray(pdim1, dims=['a'], coords={'a': np.random.randn(3)}) + idx2 = DataArray(pdim2, dims=['b'], coords={'b': np.random.randn(3)}) + idx3 = DataArray(pdim3, dims=['c'], coords={'c': np.random.randn(3)}) + # Should work with DataArray + actual = data.isel(dim1=idx1, dim2=idx2, dim3=idx3) + assert 'a' in actual.dims + assert 'b' in actual.dims + assert 'c' in actual.dims + assert 'time' in actual.coords + assert 'dim2' in actual.coords + assert 'dim3' in actual.coords + expected = data.isel(dim1=(('a', ), pdim1), + dim2=(('b', ), pdim2), + dim3=(('c', ), pdim3)) + expected = expected.assign_coords(a=idx1['a'], b=idx2['b'], + c=idx3['c']) + self.assertDatasetIdentical(actual, expected) + + idx1 = DataArray(pdim1, dims=['a'], coords={'a': np.random.randn(3)}) + idx2 = DataArray(pdim2, dims=['a']) + idx3 = DataArray(pdim3, dims=['a']) + # Should work with DataArray + actual = data.isel(dim1=idx1, dim2=idx2, dim3=idx3) + assert 'a' in actual.dims + assert 'time' in actual.coords + assert 'dim2' in actual.coords + assert 'dim3' in actual.coords + expected = data.isel(dim1=(('a', ), pdim1), + dim2=(('a', ), pdim2), + dim3=(('a', ), pdim3)) + expected = expected.assign_coords(a=idx1['a']) + self.assertDatasetIdentical(actual, expected) actual = data.isel(dim1=(('points', ), pdim1), dim2=(('points', ), pdim2)) @@ -891,7 +929,7 @@ def test_isel_fancy(self): actual = ds.isel(x=(('points', ), [0, 1, 2])) self.assertDataArrayIdentical(ds['y'], actual['y']) - # tests using index or DataArray as a dim + # tests using index or DataArray as indexers stations = Dataset() stations['station'] = (('station', ), ['A', 'B', 'C']) stations['dim1s'] = (('station', ), [1, 2, 3]) @@ -916,13 +954,14 @@ def test_isel_fancy(self): stations['b'] = (('b', ), [0, 1]) stations['dim1s'] = (('a', 'b'), [[1, 2], [2, 3], [3, 4]]) stations['dim2s'] = (('a', ), [4, 5, 1]) - - actual = data.isel(dim1=stations['dim1s'], - dim2=stations['dim2s']) + actual = 
data.isel(dim1=stations['dim1s'], dim2=stations['dim2s']) assert 'a' in actual.coords assert 'a' in actual.dims assert 'b' in actual.coords assert 'b' in actual.dims + assert 'dim2' in actual.coords + assert 'a' in actual['dim2'].dims + self.assertDataArrayIdentical(actual['a'].drop(['dim2']), stations['a']) self.assertDataArrayIdentical(actual['b'], stations['b']) @@ -932,6 +971,7 @@ def test_isel_fancy(self): stations['dim2s'].variable] expected_var3 = data['var3'].variable[slice(None), stations['dim1s'].variable] + self.assertDataArrayEqual(actual['a'].drop('dim2'), stations['a']) self.assertArrayEqual(actual['var1'], expected_var1) self.assertArrayEqual(actual['var2'], expected_var2) self.assertArrayEqual(actual['var3'], expected_var3) @@ -1106,18 +1146,62 @@ def test_sel_fancy(self): dim3=(('test_coord', ), data.dim3[pdim3])) self.assertDatasetIdentical(expected, actual) + # DataArray Indexer + idx_t = DataArray(data['time'][[3, 2, 1]].values, dims=['a'], + coords={'a': ['a', 'b', 'c']}) + idx_2 = DataArray(data['dim2'][[3, 2, 1]].values, dims=['a'], + coords={'a': ['a', 'b', 'c']}) + idx_3 = DataArray(data['dim3'][[3, 2, 1]].values, dims=['a'], + coords={'a': ['a', 'b', 'c']}) + actual = data.sel(time=idx_t, dim2=idx_2, dim3=idx_3) + expected = data.isel(time=(('a', ), [3, 2, 1]), + dim2=(('a', ), [3, 2, 1]), + dim3=(('a', ), [3, 2, 1])) + expected = expected.assign_coords(a=idx_t['a']) + self.assertDatasetIdentical(expected, actual) + + idx_t = DataArray(data['time'][[3, 2, 1]].values, dims=['a'], + coords={'a': ['a', 'b', 'c']}) + idx_2 = DataArray(data['dim2'][[2, 1, 3]].values, dims=['b'], + coords={'b': [0, 1, 2]}) + idx_3 = DataArray(data['dim3'][[1, 2, 1]].values, dims=['c'], + coords={'c': [0.0, 1.1, 2.2]}) + actual = data.sel(time=idx_t, dim2=idx_2, dim3=idx_3) + expected = data.isel(time=(('a', ), [3, 2, 1]), + dim2=(('b', ), [2, 1, 3]), + dim3=(('c', ), [1, 2, 1])) + expected = expected.assign_coords(a=idx_t['a'], b=idx_2['b'], + c=idx_3['c']) + 
self.assertDatasetIdentical(expected, actual) + + # Multi Dimensional indexers + #data.sel(x=[]) + + # test from sel_points data = Dataset({'foo': (('x', 'y'), np.arange(9).reshape(3, 3))}) - expected = Dataset({'foo': ('points', [0, 4, 8])}) - actual = data.sel_points(x=[0, 1, 2], y=[0, 1, 2]) + data.coords.update({'x': [0, 1, 2], 'y': [0, 1, 2]}) + + expected = Dataset({'foo': ('points', [0, 4, 8])}, + coords={'x': (('points', ), [0, 1, 2]), + 'y': (('points', ), [0, 1, 2])}) + actual = data.sel(x=(('points', ), [0, 1, 2]), + y=(('points', ), [0, 1, 2])) self.assertDatasetIdentical(expected, actual) - data.coords.update({'x': [0, 1, 2], 'y': [0, 1, 2]}) expected.coords.update({'x': ('points', [0, 1, 2]), 'y': ('points', [0, 1, 2])}) - actual = data.sel_points(x=[0.1, 1.1, 2.5], y=[0, 1.2, 2.0], - method='pad') + actual = data.sel(x=(('points', ), [0.1, 1.1, 2.5]), + y=(('points', ), [0, 1.2, 2.0]), method='pad') self.assertDatasetIdentical(expected, actual) + idx_x = DataArray([0, 1, 2], dims=['a'], coords={'a': ['a', 'b', 'c']}) + idx_y = DataArray([0, 2, 1], dims=['b'], coords={'b': [0, 3, 6]}) + expected_ary = data['foo'][[0, 1, 2], [0, 2, 1]] + actual = data.sel(x=idx_x, y=idx_y) + self.assertArrayEqual(expected_ary, actual['foo']) + self.assertDataArrayIdentical(actual['a'].drop('x'), idx_x['a']) + self.assertDataArrayIdentical(actual['b'].drop('y'), idx_y['b']) + if pd.__version__ >= '0.17': with self.assertRaises(KeyError): data.sel_points(x=[2.5], y=[2.0], method='pad', tolerance=1e-3) From 631f6e97939296877e2a24391572fa5d920871b8 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Aug 2017 20:48:52 -0700 Subject: [PATCH 054/113] Changes per review --- xarray/core/indexing.py | 2 +- xarray/core/nputils.py | 2 +- xarray/core/variable.py | 2 +- xarray/tests/test_indexing.py | 2 +- xarray/tests/test_nputils.py | 4 ++-- xarray/tests/test_variable.py | 6 ++++++ 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/xarray/core/indexing.py 
b/xarray/core/indexing.py index 2a75653a863..06f812ef3de 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -477,7 +477,7 @@ def _indexing_array_and_key(self, key): key = _outer_to_numpy_indexer(key, self.array.shape) if isinstance(key, VectorizedIndexer): - array = nputils.VectorizedIndex(self.array) + array = nputils.NumpyVIndexAdapter(self.array) else: array = self.array diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 3c4982ac0bf..4f87608f42c 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -116,7 +116,7 @@ def _advanced_indexer_subspaces(key): return mixed_positions, vindex_positions -class VectorizedIndex(object): +class NumpyVIndexAdapter(object): """Object that implements indexing like vindex on a np.ndarray. This is a pure Python implementation of (some of) the logic in this NumPy diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e84b9461733..0b26be1583f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -508,7 +508,7 @@ def _broadcast_indexes_vectorized(self, key): # result. This is significantly faster/more efficient for # most array backends. 
values = np.arange(*value.indices(self.sizes[dim])) - variables.insert(i, Variable((dim,), values)) + variables.insert(i - len(slices), Variable((dim,), values)) else: slices.append((i, value)) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index ed80ca7aad7..34a0060309d 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -245,7 +245,7 @@ def nonzero(x): for i, j, k in itertools.product(indexers, repeat=3): _, expected, new_order = v._broadcast_indexes_vectorized((i, j, k)) - expected_data = nputils.VectorizedIndex(v.data)[expected] + expected_data = nputils.NumpyVIndexAdapter(v.data)[expected] if new_order: old_order = range(len(new_order)) expected_data = moveaxis(expected_data, old_order, new_order) diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py index 062885ae155..83445e4639f 100644 --- a/xarray/tests/test_nputils.py +++ b/xarray/tests/test_nputils.py @@ -1,7 +1,7 @@ import numpy as np from numpy.testing import assert_array_equal -from xarray.core.nputils import _is_contiguous, VectorizedIndex +from xarray.core.nputils import _is_contiguous, NumpyVIndexAdapter def test_is_contiguous(): @@ -12,7 +12,7 @@ def test_is_contiguous(): def test_vindex(): x = np.arange(3 * 4 * 5).reshape((3, 4, 5)) - vindex = VectorizedIndex(x) + vindex = NumpyVIndexAdapter(x) # getitem assert_array_equal(vindex[0], x[0]) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f0fdc71b483..a75e917e47e 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -584,6 +584,12 @@ def test_getitem_fancy(self): expected = Variable(['a', 'b', 'y', 'z'], v.data[np.newaxis, ...]) self.assertVariableIdentical(v_new, expected) + v = Variable(['w', 'x', 'y', 'z'], [[[[1, 2, 3], [4, 5, 6]]]]) + ind = Variable(['y'], [0]) + v_new = v[ind, :, 1:2, 2] + expected = Variable(['y', 'x'], [[6]]) + self.assertVariableIdentical(v_new, expected) + def test_getitem_error(self): v 
= self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) From 3231445cca6903a1b6d961b7566224039026c40b Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 7 Aug 2017 20:58:32 +0900 Subject: [PATCH 055/113] Restore `isel_points`. Remove automatic tuple conversion for `sel` --- xarray/core/dataset.py | 131 +++++++++++++++++++++++---------- xarray/tests/test_dataarray.py | 26 +++++-- xarray/tests/test_dataset.py | 53 ++++++++----- 3 files changed, 146 insertions(+), 64 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8fbc971d266..0ff5221100f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1225,23 +1225,8 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): new_coords = {k: v._coords for k, v in indexers.items() if isinstance(v, DataArray)} - for k, v in indexers.items(): - if isinstance(v, tuple): - if (k in self.indexes and - (isinstance(self.indexes[k].data, pd.MultiIndex) or - self.indexes[k].dtype == 'object')): - # If array dtype is tuple, we should be carefully check - # whether indexer should be Variable or not. 
- try: - v_tmp = as_variable(v) - # TODO should check dtype consistency - indexers[k] = v_tmp - except (ValueError, TypeError): - pass - else: - indexers[k] = as_variable(v) - elif isinstance(v, DataArray): - indexers[k] = v.variable + indexers = {k: v.variable if isinstance(v, DataArray) else v + for k, v in indexers.items()} pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance @@ -1291,30 +1276,98 @@ def isel_points(self, dim='points', **indexers): Dataset.sel_points DataArray.isel_points """ + indexer_dims = set(indexers) - import warnings - warnings.warn('Dataset.isel_points is deprecated: use Dataset.isel()' - 'instead', DeprecationWarning, stacklevel=2) + def take(variable, slices): + # Note: remove helper function when once when numpy + # supports vindex https://github.com/numpy/numpy/pull/6075 + if hasattr(variable.data, 'vindex'): + # Special case for dask backed arrays to use vectorised list indexing + sel = variable.data.vindex[slices] + else: + # Otherwise assume backend is numpy array with 'fancy' indexing + sel = variable.data[slices] + return sel + + def relevant_keys(mapping): + return [k for k, v in mapping.items() + if any(d in indexer_dims for d in v.dims)] + + coords = relevant_keys(self.coords) + indexers = [(k, np.asarray(v)) for k, v in iteritems(indexers)] + indexers_dict = dict(indexers) + non_indexed_dims = set(self.dims) - indexer_dims + non_indexed_coords = set(self.coords) - set(coords) + + # All the indexers should be iterables + # Check that indexers are valid dims, integers, and 1D + for k, v in indexers: + if k not in self.dims: + raise ValueError("dimension %s does not exist" % k) + if v.dtype.kind != 'i': + raise TypeError('Indexers must be integers') + if v.ndim != 1: + raise ValueError('Indexers must be 1 dimensional') + + # all the indexers should have the same length + lengths = set(len(v) for k, v in indexers) + if len(lengths) > 1: + raise ValueError('All indexers must 
be the same length') + + # Existing dimensions are not valid choices for the dim argument + if isinstance(dim, basestring): + if dim in self.dims: + # dim is an invalid string + raise ValueError('Existing dimension names are not valid ' + 'choices for the dim argument in sel_points') + + elif hasattr(dim, 'dims'): + # dim is a DataArray or Coordinate + if dim.name in self.dims: + # dim already exists + raise ValueError('Existing dimensions are not valid choices ' + 'for the dim argument in sel_points') + + # Set the new dim_name, and optionally the new dim coordinate + # dim is either an array-like or a string + if not utils.is_scalar(dim): + # dim is array like get name or assign 'points', get as variable + dim_name = 'points' if not hasattr(dim, 'name') else dim.name + dim_coord = as_variable(dim, name=dim_name) + else: + # dim is a string + dim_name = dim + dim_coord = None - from .dataarray import DataArray - if isinstance(dim, DataArray): - indexers = {k: DataArray(v, dims=[dim.name], coords=dim.coords) - for k, v in iteritems(indexers)} - return self.isel(**indexers) - - if isinstance(dim, (list, np.ndarray)): - indexers = {k: DataArray(v, dims=['points'], - coords={'points': dim}) - for k, v in iteritems(indexers)} - return self.isel(**indexers) - - if isinstance(dim, pd.Index): - indexers = {k: DataArray(v, dims=[dim.name, ], - coords={dim.name: dim}) - for k, v in iteritems(indexers)} - return self.isel(**indexers) - - return self.isel(**{k: ((dim, ), v) for k, v in iteritems(indexers)}) + reordered = self.transpose(*(list(indexer_dims) + list(non_indexed_dims))) + + variables = OrderedDict() + + for name, var in reordered.variables.items(): + if name in indexers_dict or any(d in indexer_dims for d in var.dims): + # slice if var is an indexer or depends on an indexed dim + slc = [indexers_dict[k] + if k in indexers_dict + else slice(None) for k in var.dims] + + var_dims = [dim_name] + [d for d in var.dims + if d in non_indexed_dims] + selection = take(var, 
tuple(slc)) + var_subset = type(var)(var_dims, selection, var.attrs) + variables[name] = var_subset + else: + # If not indexed just add it back to variables or coordinates + variables[name] = var + + coord_names = (set(coords) & set(variables)) | non_indexed_coords + + dset = self._replace_vars_and_dims(variables, coord_names=coord_names) + # Add the dim coord to the new dset. Must be done after creation + # because_replace_vars_and_dims can only access existing coords, + # not add new ones + if dim_coord is not None: + dset.coords[dim_name] = dim_coord + return dset def sel_points(self, dim='points', method=None, tolerance=None, **indexers): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 1094c56f6f4..fd6e057f705 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -657,11 +657,14 @@ def test_isel_points(self): actual = da.isel_points(y=y, x=x, dim='test_coord') assert actual.coords['test_coord'].shape == (len(y), ) assert list(actual.coords) == ['time'] - assert actual.dims == ('time', 'test_coord') + assert actual.dims == ('test_coord', 'time') actual = da.isel_points(y=y, x=x) assert 'points' in actual.dims - np.testing.assert_equal(actual, expected) + # Note that because xarray always concatenates along the first + # dimension, We must transpose the result to match the numpy style of + # concatenation. 
+ np.testing.assert_equal(actual.T, expected) # a few corner cases da.isel_points(time=[1, 2], x=[2, 2], y=[3, 4]) @@ -673,7 +676,7 @@ def test_isel_points(self): x = [-2, 2] expected = da.values[:, y, x] actual = da.isel_points(x=x, y=y).values - np.testing.assert_equal(actual, expected) + np.testing.assert_equal(actual.T, expected) # test that the order of the indexers doesn't matter self.assertDataArrayIdentical( @@ -681,9 +684,22 @@ def test_isel_points(self): da.isel_points(x=x, y=y)) # make sure we're raising errors in the right places - with self.assertRaisesRegexp(IndexError, - 'Dimensions of indexers mismatch'): + with self.assertRaisesRegexp(ValueError, + 'All indexers must be the same length'): da.isel_points(y=[1, 2], x=[1, 2, 3]) + with self.assertRaisesRegexp(ValueError, + 'dimension bad_key does not exist'): + da.isel_points(bad_key=[1, 2]) + with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): + da.isel_points(y=[1.5, 2.2]) + with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): + da.isel_points(x=[1, 2, 3], y=slice(3)) + with self.assertRaisesRegexp(ValueError, + 'Indexers must be 1 dimensional'): + da.isel_points(y=1, x=2) + with self.assertRaisesRegexp(ValueError, + 'Existing dimension names are not'): + da.isel_points(y=[1, 2], x=[1, 2], dim='x') # using non string dims actual = da.isel_points(y=[1, 2], x=[1, 2], dim=['A', 'B']) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2bc2070560e..2716f432527 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1057,9 +1057,22 @@ def test_isel_points(self): data.isel_points(dim2=pdim2, dim1=pdim1)) # make sure we're raising errors in the right places - with self.assertRaisesRegexp(IndexError, - 'Dimensions of indexers mismatch'): + with self.assertRaisesRegexp(ValueError, + 'All indexers must be the same length'): data.isel_points(dim1=[1, 2], dim2=[1, 2, 3]) + with self.assertRaisesRegexp(ValueError, + 'dimension 
bad_key does not exist'): + data.isel_points(bad_key=[1, 2]) + with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): + data.isel_points(dim1=[1.5, 2.2]) + with self.assertRaisesRegexp(TypeError, 'Indexers must be integers'): + data.isel_points(dim1=[1, 2, 3], dim2=slice(3)) + with self.assertRaisesRegexp(ValueError, + 'Indexers must be 1 dimensional'): + data.isel_points(dim1=1, dim2=2) + with self.assertRaisesRegexp(ValueError, + 'Existing dimension names are not valid'): + data.isel_points(dim1=[1, 2], dim2=[1, 2], dim='dim2') # test to be sure we keep around variables that were not indexed ds = Dataset({'x': [1, 2, 3, 4], 'y': 0}) @@ -1138,12 +1151,12 @@ def test_sel_fancy(self): pdim1 = [1, 2, 3] pdim2 = [4, 5, 1] pdim3 = [1, 2, 3] - expected = data.isel(dim1=(('test_coord', ), pdim1), - dim2=(('test_coord', ), pdim2), - dim3=(('test_coord'), pdim3)) - actual = data.sel(dim1=(('test_coord', ), data.dim1[pdim1]), - dim2=(('test_coord', ), data.dim2[pdim2]), - dim3=(('test_coord', ), data.dim3[pdim3])) + expected = data.isel(dim1=Variable(('test_coord', ), pdim1), + dim2=Variable(('test_coord', ), pdim2), + dim3=Variable(('test_coord'), pdim3)) + actual = data.sel(dim1=Variable(('test_coord', ), data.dim1[pdim1]), + dim2=Variable(('test_coord', ), data.dim2[pdim2]), + dim3=Variable(('test_coord', ), data.dim3[pdim3])) self.assertDatasetIdentical(expected, actual) # DataArray Indexer @@ -1154,9 +1167,9 @@ def test_sel_fancy(self): idx_3 = DataArray(data['dim3'][[3, 2, 1]].values, dims=['a'], coords={'a': ['a', 'b', 'c']}) actual = data.sel(time=idx_t, dim2=idx_2, dim3=idx_3) - expected = data.isel(time=(('a', ), [3, 2, 1]), - dim2=(('a', ), [3, 2, 1]), - dim3=(('a', ), [3, 2, 1])) + expected = data.isel(time=Variable(('a', ), [3, 2, 1]), + dim2=Variable(('a', ), [3, 2, 1]), + dim3=Variable(('a', ), [3, 2, 1])) expected = expected.assign_coords(a=idx_t['a']) self.assertDatasetIdentical(expected, actual) @@ -1167,9 +1180,9 @@ def 
test_sel_fancy(self): idx_3 = DataArray(data['dim3'][[1, 2, 1]].values, dims=['c'], coords={'c': [0.0, 1.1, 2.2]}) actual = data.sel(time=idx_t, dim2=idx_2, dim3=idx_3) - expected = data.isel(time=(('a', ), [3, 2, 1]), - dim2=(('b', ), [2, 1, 3]), - dim3=(('c', ), [1, 2, 1])) + expected = data.isel(time=Variable(('a', ), [3, 2, 1]), + dim2=Variable(('b', ), [2, 1, 3]), + dim3=Variable(('c', ), [1, 2, 1])) expected = expected.assign_coords(a=idx_t['a'], b=idx_2['b'], c=idx_3['c']) self.assertDatasetIdentical(expected, actual) @@ -1182,16 +1195,16 @@ def test_sel_fancy(self): data.coords.update({'x': [0, 1, 2], 'y': [0, 1, 2]}) expected = Dataset({'foo': ('points', [0, 4, 8])}, - coords={'x': (('points', ), [0, 1, 2]), - 'y': (('points', ), [0, 1, 2])}) - actual = data.sel(x=(('points', ), [0, 1, 2]), - y=(('points', ), [0, 1, 2])) + coords={'x': Variable(('points', ), [0, 1, 2]), + 'y': Variable(('points', ), [0, 1, 2])}) + actual = data.sel(x=Variable(('points', ), [0, 1, 2]), + y=Variable(('points', ), [0, 1, 2])) self.assertDatasetIdentical(expected, actual) expected.coords.update({'x': ('points', [0, 1, 2]), 'y': ('points', [0, 1, 2])}) - actual = data.sel(x=(('points', ), [0.1, 1.1, 2.5]), - y=(('points', ), [0, 1.2, 2.0]), method='pad') + actual = data.sel(x=Variable(('points', ), [0.1, 1.1, 2.5]), + y=Variable(('points', ), [0, 1.2, 2.0]), method='pad') self.assertDatasetIdentical(expected, actual) idx_x = DataArray([0, 1, 2], dims=['a'], coords={'a': ['a', 'b', 'c']}) From dd325c5e39b296b93c044bce7bc826b774926035 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 7 Aug 2017 21:46:27 +0900 Subject: [PATCH 056/113] Some clean up --- xarray/core/formatting.py | 4 ++-- xarray/core/indexing.py | 5 ----- xarray/core/variable.py | 13 ++++++------- xarray/tests/test_variable.py | 2 +- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 9b21691a5b6..2e7f0801bfb 100644 --- 
a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -21,7 +21,7 @@ from .options import OPTIONS from .pycompat import PY2, unicode_type, bytes_type, dask_array_type -from .indexing import IndexerTuple +from .indexing import BasicIndexer def pretty_print(x, numchars): @@ -69,7 +69,7 @@ def _get_indexer_at_least_n_items(shape, n_desired): cum_items = np.cumprod(shape[::-1]) n_steps = np.argmax(cum_items >= n_desired) stop = int(np.ceil(float(n_desired) / np.r_[1, cum_items][n_steps])) - indexer = IndexerTuple((0, ) * (len(shape) - 1 - n_steps) + (slice(stop), ) + indexer = BasicIndexer((0, ) * (len(shape) - 1 - n_steps) + (slice(stop), ) + (slice(None), ) * n_steps) return indexer diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index a2c66e42af1..3031c67e545 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -8,7 +8,6 @@ from . import nputils from . import utils -from .npcompat import moveaxis from .pycompat import (iteritems, range, integer_types, dask_array_type, suppress) from .utils import is_dict_like @@ -303,10 +302,6 @@ class VectorizedIndexer(IndexerTuple): """ Tuple for vectorized indexing """ -class PointwiseIndexer(IndexerTuple): - """ Tuple for pointwise indexing with dask.array's vindex """ - - class LazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy """ diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0b26be1583f..66dfe4ea665 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -20,7 +20,7 @@ from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) from .indexing import (PandasIndexAdapter, xarray_indexable, BasicIndexer, - OuterIndexer, PointwiseIndexer, VectorizedIndexer) + OuterIndexer, VectorizedIndexer) from .utils import OrderedSet import xarray as xr # only for Dataset and DataArray @@ -417,7 +417,7 @@ def _broadcast_indexes(self, key): return self._broadcast_indexes_basic(key) 
# Detect it can be mapped as an outer indexer - # If all key is unlabelled, or + # If all key is unlabeled, or # key can be mapped as an OuterIndexer. if all(not isinstance(k, Variable) for k in key): return self._broadcast_indexes_outer(key) @@ -457,7 +457,7 @@ def _broadcast_indexes_outer(self, key): else: k = np.asarray(k) if k.ndim > 1: - raise IndexError("Unlabelled multi-dimensional array " + raise IndexError("Unlabeled multi-dimensional array " "cannot be used for indexing: {}".format( k)) indexer.append(k if k.dtype.kind != 'b' else np.flatnonzero(k)) @@ -482,7 +482,7 @@ def _broadcast_indexes_vectorized(self, key): try: variable = as_variable(value, name=dim) except MissingDimensionsError: # change to better exception - raise IndexError("Unlabelled multi-dimensional array " + raise IndexError("Unlabeled multi-dimensional array " "cannot be used for indexing.") if variable.dtype.kind == 'b': # boolean indexing case @@ -537,10 +537,9 @@ def __getitem__(self, key): """Return a new Array object whose contents are consistent with getting the provided key from the underlying data. - # TODO more docstrings. NB. __getitem__ and __setitem__ implement xarray-style indexing, - where if keys are unlabelled arrays, we index the array orthogonally - with them. If keys are labelled array (such as Variables), they are + where if keys are unlabeled arrays, we index the array orthogonally + with them. If keys are labeled array (such as Variables), they are broadcasted with our usual scheme and then the array is indexed with the broadcasted key, like numpy's fancy indexing. 
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index a75e917e47e..d5aa6d84910 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -593,7 +593,7 @@ def test_getitem_fancy(self): def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) - with self.assertRaisesRegexp(IndexError, "Unlabelled multi-"): + with self.assertRaisesRegexp(IndexError, "labeled multi-"): v[[[0, 1], [1, 2]]] ind_x = Variable(['a'], [0, 1, 1]) From 434a0048c86eb6aafe6a0a625d634fdad7eb20db Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 9 Aug 2017 10:03:24 +0900 Subject: [PATCH 057/113] Supported indexing by a scalar Variable --- xarray/core/variable.py | 3 +++ xarray/tests/test_variable.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 66dfe4ea665..5ea52c54d73 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -413,6 +413,9 @@ def _broadcast_indexes(self, key): key = self._item_key_to_tuple(key) # key is a tuple # key is a tuple of full size key = indexing.expanded_indexer(key, self.ndim) + # Convert a scalar Variable as an integer + key = tuple([(k.data.item() if isinstance(k, Variable) and k.ndim == 0 + else k) for k in key]) if all(isinstance(k, BASIC_INDEXING_TYPES) for k in key): return self._broadcast_indexes_basic(key) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d5aa6d84910..be4b4ba2472 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -523,6 +523,12 @@ def test_getitem_advanced(self): assert v_new.dims == ('x', 'a') self.assertArrayEqual(v_new, v_data[:, 0:1]) + # with scalar variable + ind = Variable((), 2) + v_new = v[dict(y=ind)] + expected = v[dict(y=2)] + self.assertArrayEqual(v_new, expected) + def test_getitem_fancy(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) v_data = v.compute().data From d518f7acd78423dd6596e0748f44ee8aac10e76e 
Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 9 Aug 2017 10:03:50 +0900 Subject: [PATCH 058/113] Supported the indexing by DataArray with coordinates. --- xarray/core/dataset.py | 43 ++++++++++++++++++++++----- xarray/tests/test_dataset.py | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 7 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0ff5221100f..68481865da8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1094,6 +1094,33 @@ def maybe_chunk(name, var, chunks): for k, v in self.variables.items()]) return self._replace_vars_and_dims(variables) + def _get_indexers_coordinates(self, indexers): + """ Extract coordinates from indexers. + Returns an OrderedDict mapping from coordinate name to the + coordinate variable. + + Coordinates to be extracted and attached should satisfy + + dimension coordinate of the indexers + + does not have a different name from sef.variables + """ + from .dataarray import DataArray + + coord_list = [] + for k, v in indexers.items(): + if isinstance(v, DataArray): + coords = {d: v.coords[d].variable for d in v.dims + if d in v.coords and d not in self.variables} + if v.dtype.kind == 'b' and v.dims[0] in coords: + # Make sure in case of boolean DataArray, its + # coordinate should be also indexed. + assert v.ndim == 1 # we only support 1-d boolean array + coords[v.dims[0]] = coords[v.dims[0]][v.variable] + coord_list.append(coords) + + # we don't need to call align() explicitly, because merge_variables + # already checks for exact alignment between dimension coordinates + return merge_variables(coord_list) + def isel(self, drop=False, **indexers): """Returns a new dataset with each array indexed along the specified dimension(s). 
@@ -1134,13 +1161,6 @@ def isel(self, drop=False, **indexers): if invalid: raise ValueError("dimensions %r do not exist" % invalid) - # extract new coordinates from indexers - # we don't need to call align() explicitly, because merge_variables - # already checks for exact alignment between dimension coordinates - variables = merge_variables([v._coords for v in indexers.values() - if isinstance(v, DataArray)]) - coord_names = set(self._coord_names) | set(variables) - # all indexers should be int, slice, np.ndarrays, or Variable indexers_list = [] for k, v in iteritems(indexers): @@ -1156,11 +1176,20 @@ def isel(self, drop=False, **indexers): v = np.asarray(v) indexers_list.append((k, v)) + coord_vars = self._get_indexers_coordinates(indexers) + coord_names = set(self._coord_names) | set(coord_vars) + + variables = OrderedDict() for name, var in iteritems(self._variables): var_indexers = {k: v for k, v in indexers_list if k in var.dims} new_var = var.isel(**var_indexers) if not (drop and name in var_indexers): variables[name] = new_var + + # attatch / overwrite coordinate in indexers + for k, v in coord_vars.items(): + variables[k] = v + coord_names = coord_names & set(variables) return self._replace_vars_and_dims(variables, coord_names=coord_names) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2716f432527..5a4a41ac590 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -976,6 +976,63 @@ def test_isel_fancy(self): self.assertArrayEqual(actual['var2'], expected_var2) self.assertArrayEqual(actual['var3'], expected_var3) + def test_isel_dataarray(self): + """ Test for indexing by DataArray """ + data = create_test_data() + # indexing with DataArray with same-name coordinates. 
+ indexing_da = DataArray(np.arange(1, 4), dims=['dim1'], + coords={'dim1': np.random.randn(3)}) + actual = data.isel(dim1=indexing_da) + self.assertDataArrayIdentical(indexing_da['dim1'], actual['dim1']) + self.assertDataArrayIdentical(data['dim2'], actual['dim2']) + + # not overwrite coordinate + indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], + coords={'dim2': np.random.randn(3)}) + actual = data.isel(dim2=indexing_da) + self.assertDataArrayIdentical(actual['dim2'], + data['dim2'].isel(dim2=np.arange(1, 4))) + + # isel for the coordinate. Should not attach the coordinate + actual = data['dim2'].isel(dim2=indexing_da) + self.assertDataArrayIdentical(actual, + data['dim2'].isel(dim2=np.arange(1, 4))) + + # boolean data array with coordinate with the same name + indexing_da = (indexing_da < 3) + actual = data.isel(dim2=indexing_da) + self.assertDataArrayIdentical(actual['dim2'], data['dim2'][:2]) + + # boolean data array with coordinate with the different name + indexing_da = DataArray(np.arange(1, 4), dims=['new_dim'], + coords={'new_dim': np.random.randn(3)}) + actual = data.isel(dim2=indexing_da < 3) + assert 'new_dim' in actual + assert 'new_dim' in actual.coords + self.assertDataArrayIdentical(actual['new_dim'].drop('dim2'), + indexing_da['new_dim'][:2]) + + # non-dimension coordinate will be ignored + indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], + coords={'dim2': np.random.randn(3), + 'non_dim': (('dim2', ), + np.random.randn(3))}) + actual = data.isel(dim2=indexing_da) + assert 'non_dim' not in actual + assert 'non_dim' not in actual.coords + + # indexing with DataArray with drop=True + indexing_da = DataArray(np.arange(1, 4), dims=['a'], + coords={'a': np.random.randn(3)}) + actual = data.isel(dim1=indexing_da) + assert 'a' in actual + assert 'dim1' not in actual + + # Index by a scalar DataArray + indexing_da = DataArray(3, dims=[], coords={'station': 2}) + actual = data.isel(dim2=indexing_da) + assert 'station' not in actual + def 
test_sel(self): data = create_test_data() int_slicers = {'dim1': slice(None, None, 2), From ba3cc884a040ab8b94700d3e2c208d4d4eda304f Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 9 Aug 2017 20:06:44 +0900 Subject: [PATCH 059/113] Update DataArray.loc and DataArray.sel to use Dataset.loc and Dataset.sel --- xarray/core/dataarray.py | 19 ++++++++++--------- xarray/core/dataset.py | 8 ++++---- xarray/tests/test_dataarray.py | 25 +++++++++++++++++++++++++ xarray/tests/test_dataset.py | 26 ++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8700446295c..84c26c04930 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -103,8 +103,11 @@ def _remap_key(self, key): return indexing.remap_label_indexers(self.data_array, key) def __getitem__(self, key): - pos_indexers, new_indexes = self._remap_key(key) - return self.data_array[pos_indexers]._replace_indexes(new_indexes) + if not utils.is_dict_like(key): + # expand the indexer so we can handle Ellipsis + labels = indexing.expanded_indexer(key, self.data_array.ndim) + key = dict(zip(self.data_array.dims, labels)) + return self.data_array.sel(**key) def __setitem__(self, key, value): pos_indexers, _ = self._remap_key(key) @@ -468,14 +471,14 @@ def __getitem__(self, key): if isinstance(key, basestring): return self._getitem_coord(key) else: - # orthogonal array indexing + # xarray-style array indexing return self.isel(**self._item_key_to_dict(key)) def __setitem__(self, key, value): if isinstance(key, basestring): self.coords[key] = value else: - # orthogonal array indexing + # xarray-style array indexing self.variable[key] = value def __delitem__(self, key): @@ -688,11 +691,9 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): Dataset.sel DataArray.isel """ - pos_indexers, new_indexes = indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance - ) - result = 
self.isel(drop=drop, **pos_indexers) - return result._replace_indexes(new_indexes) + ds = self._to_temp_dataset().sel(drop=drop, method=method, + tolerance=tolerance, **indexers) + return self._from_temp_dataset(ds) def isel_points(self, dim='points', **indexers): """Return a new DataArray whose dataset is given by pointwise integer diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 68481865da8..2473fb67bcd 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1100,8 +1100,9 @@ def _get_indexers_coordinates(self, indexers): coordinate variable. Coordinates to be extracted and attached should satisfy - + dimension coordinate of the indexers - + does not have a different name from sef.variables + + Dimension coordinate of the indexers. + Non-dimension coordinate of the indexers are not attached. + + Only coordinate with a name different from any of sef.variables. """ from .dataarray import DataArray @@ -1262,8 +1263,7 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): ) # attach indexer's coordinate to pos_indexers for k, v in new_coords.items(): - pos_indexers[k] = DataArray(pos_indexers[k], dims=v.keys(), - coords=v) + pos_indexers[k] = DataArray(pos_indexers[k], coords=v) result = self.isel(drop=drop, **pos_indexers) return result._replace_indexes(new_indexes) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index fd6e057f705..e8e0bbd0a8c 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -595,6 +595,31 @@ def test_sel(self): self.assertDataArrayIdentical(da[1], da.sel(x=b)) self.assertDataArrayIdentical(da[[1]], da.sel(x=slice(b, b))) + def test_sel_dataarray(self): + # indexing with DataArray + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + da = self.ds['foo'] + + ind = DataArray(['a', 'b', 'c'], dims=['x']) + actual = da.sel(x=ind) + self.assertDataArrayIdentical(actual, da.isel(x=[0, 1, 2])) + + # along new dimension + ind = DataArray(['a', 
'b', 'c'], dims=['new_dim']) + actual = da.sel(x=ind) + self.assertArrayEqual(actual, da.isel(x=[0, 1, 2])) + assert 'new_dim' in actual.dims + + # with coordinate + ind = DataArray(['a', 'b', 'c'], dims=['new_dim'], + coords={'new_dim': [0, 1, 2]}) + actual = da.sel(x=ind) + self.assertArrayEqual(actual, da.isel(x=[0, 1, 2])) + assert 'new_dim' in actual.dims + assert 'new_dim' in actual.coords + self.assertDataArrayEqual(actual['new_dim'].drop('x'), + ind['new_dim']) + def test_sel_no_index(self): array = DataArray(np.arange(10), dims='x') self.assertDataArrayIdentical(array[0], array.sel(x=0)) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 5a4a41ac590..0551f7ab3d0 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1067,6 +1067,32 @@ def test_sel(self): self.assertDatasetEqual(data.isel(td=slice(1, 3)), data.sel(td=slice('1 days', '2 days'))) + def test_sel_dataarray(self): + data = create_test_data() + + ind = DataArray([0.0, 0.5, 1.0], dims=['dim2']) + actual = data.sel(dim2=ind) + self.assertDatasetEqual(actual, data.isel(dim2=[0, 1, 2])) + + # with different dimension + ind = DataArray([0.0, 0.5, 1.0], dims=['new_dim']) + actual = data.sel(dim2=ind) + expected = data.isel(dim2=[0, 1, 2]).rename({'dim2': 'new_dim'}) + assert 'new_dim' in actual.dims + self.assertDatasetEqual(actual.drop('dim2'), expected.drop('new_dim')) + + # with coordinate + ind = DataArray([0.0, 0.5, 1.0], dims=['new_dim'], + coords={'new_dim': ['a', 'b', 'c']}) + actual = data.sel(dim2=ind) + expected = data.isel(dim2=[0, 1, 2]).rename({'dim2': 'new_dim'}) + assert 'new_dim' in actual.dims + assert 'new_dim' in actual.coords + self.assertDatasetEqual(actual.drop('new_dim').drop('dim2'), + expected.drop('new_dim')) + self.assertDataArrayEqual(actual['new_dim'].drop('dim2'), + ind['new_dim']) + def test_sel_drop(self): data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) expected = Dataset({'foo': 1}) From 
aa1063597ebe8586a8e3a4f5c617adb66ed05cd6 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 21 Aug 2017 22:32:31 +0900 Subject: [PATCH 060/113] Added inhouse normalize_axis_index --- xarray/core/npcompat.py | 16 ++++++++++++++++ xarray/core/nputils.py | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 499292530af..02c8bdf5447 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -13,6 +13,22 @@ # Code copied from newer versions of NumPy (v1.10 to v1.12). # Used under the terms of NumPy's license, see licenses/NUMPY_LICENSE. + try: + from numpy.core.multiarray import normalize_axis_index + except ImportError: + def normalize_axis_index(axis, ndim, msg_prefix=None): + """ In house version of normalize_axis_index.""" + if axis < -ndim and ndim <= axis: + msg = 'axis {0:d} is out of bounds for array of dimension {1:d}'.format(axis, ndim) + if msg_prefix: + msg = msg_prefix + msg + # Note: original normalize_axis_index raises AxisError + raise IndexError(msg) + + if axis < 0: + return axis + ndim + return axis + def _maybe_view_as_subclass(original_array, new_array): if type(original_array) is not type(new_array): # if input was an ndarray subclass and subclasses were OK, diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 4f87608f42c..41c55a2c5d6 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -110,7 +110,8 @@ def _advanced_indexer_subspaces(key): return (), () non_slices = [k for k in key if not isinstance(k, slice)] - ndim = len(np.broadcast(*non_slices).shape) + ndim = len(np.broadcast(*non_slices).shape if len(non_slices) > 1 + else non_slices[0].shape) mixed_positions = advanced_index_positions[0] + np.arange(ndim) vindex_positions = np.arange(ndim) return mixed_positions, vindex_positions From fd73e82621a631a5d59324df2ea38803ce6b4e7b Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 21 Aug 2017 23:32:58 +0900 Subject: [PATCH 061/113] 
Support an integer key for _advanced_indexer_subspaces --- xarray/core/nputils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 41c55a2c5d6..35fc79285e6 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -110,8 +110,10 @@ def _advanced_indexer_subspaces(key): return (), () non_slices = [k for k in key if not isinstance(k, slice)] - ndim = len(np.broadcast(*non_slices).shape if len(non_slices) > 1 - else non_slices[0].shape) + if len(non_slices) == 1: # older np.broadcast does not support one array + ndim = len(getattr(non_slices[0], 'shape', [])) # should be 0 for int + else: + ndim = len(np.broadcast(*non_slices).shape) mixed_positions = advanced_index_positions[0] + np.arange(ndim) vindex_positions = np.arange(ndim) return mixed_positions, vindex_positions From 6202aff2762b9758788d5a32ba903daf5ed41af8 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 27 Aug 2017 13:32:34 +0900 Subject: [PATCH 062/113] Add warning for coordinate conflict. --- xarray/core/dataset.py | 18 +++++++++++++++++- xarray/tests/test_dataset.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ab48246da0e..42bd30b4d2a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1104,19 +1104,35 @@ def _get_indexers_coordinates(self, indexers): + Dimension coordinate of the indexers. Non-dimension coordinate of the indexers are not attached. + Only coordinate with a name different from any of sef.variables. + + If self already has the same name coordinate, we raise an ValueError. 
""" from .dataarray import DataArray + import warnings coord_list = [] for k, v in indexers.items(): if isinstance(v, DataArray): coords = {d: v.coords[d].variable for d in v.dims - if d in v.coords and d not in self.variables} + if d in v.coords} if v.dtype.kind == 'b' and v.dims[0] in coords: # Make sure in case of boolean DataArray, its # coordinate should be also indexed. assert v.ndim == 1 # we only support 1-d boolean array coords[v.dims[0]] = coords[v.dims[0]][v.variable] + + for k, vc in self.variables.items(): + if k in coords and not vc[v.values].equals(coords[k]): + # TODO raise an Error in the next release + warnings.warn( + "Indexer's coordiante {0:s} conflicts to the " + "exisiting coordinate. This will raise an error " + "in the next release. Use `.drop(\'{0:s}\')` to " + "index without attaching the indexer's " + "coordinate.".format(k), DeprecationWarning, + stacklevel=2) + del coords[k] + coord_list.append(coords) # we don't need to call align() explicitly, because merge_variables diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 0d058f6b219..70747330a62 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -978,6 +978,7 @@ def test_isel_fancy(self): def test_isel_dataarray(self): """ Test for indexing by DataArray """ + import warnings data = create_test_data() # indexing with DataArray with same-name coordinates. indexing_da = DataArray(np.arange(1, 4), dims=['dim1'], @@ -992,12 +993,28 @@ def test_isel_dataarray(self): actual = data.isel(dim2=indexing_da) self.assertDataArrayIdentical(actual['dim2'], data['dim2'].isel(dim2=np.arange(1, 4))) + # make sure the coordinate confliction raises a warning + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + actual = data.isel(dim2=indexing_da) + assert len(w) == 1 + assert issubclass(w[-1].category, DeprecationWarning) + assert "Indexer's coordiante dim2 conflicts" in str(w[-1].message) # isel for the coordinate. 
Should not attach the coordinate actual = data['dim2'].isel(dim2=indexing_da) self.assertDataArrayIdentical(actual, data['dim2'].isel(dim2=np.arange(1, 4))) + # same name coordinate which does not conflict + indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], + coords={'dim2': data['dim2'].values[1:4]}) + self.assertDataArrayIdentical(data['dim2'][1:4], indexing_da['dim2']) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + actual = data.isel(dim2=indexing_da) + assert len(w) == 0 # no warning + # boolean data array with coordinate with the same name indexing_da = (indexing_da < 3) actual = data.isel(dim2=indexing_da) From f9746fd5f1492136fcd9b47168e040ea63ce0edc Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 27 Aug 2017 15:28:31 +0900 Subject: [PATCH 063/113] Warning changes DeprecationWarning -> FutureWarning. --- xarray/core/dataset.py | 10 +++++----- xarray/tests/test_dataset.py | 4 +--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 42bd30b4d2a..fe27bd619e5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1125,12 +1125,12 @@ def _get_indexers_coordinates(self, indexers): if k in coords and not vc[v.values].equals(coords[k]): # TODO raise an Error in the next release warnings.warn( - "Indexer's coordiante {0:s} conflicts to the " + "Indexer's coordiante {0:s} conflicts with the " "exisiting coordinate. This will raise an error " - "in the next release. Use `.drop(\'{0:s}\')` to " - "index without attaching the indexer's " - "coordinate.".format(k), DeprecationWarning, - stacklevel=2) + "in the next release. 
" + "Use `.isel({0:s}=ind.drop(\'{0:s}\'))` to " + "index safely.".format(k), + FutureWarning, stacklevel=3) del coords[k] coord_list.append(coords) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 70747330a62..2cf0da8f7fd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -994,11 +994,9 @@ def test_isel_dataarray(self): self.assertDataArrayIdentical(actual['dim2'], data['dim2'].isel(dim2=np.arange(1, 4))) # make sure the coordinate confliction raises a warning - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") + with pytest.warns(FutureWarning) as w: actual = data.isel(dim2=indexing_da) assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) assert "Indexer's coordiante dim2 conflicts" in str(w[-1].message) # isel for the coordinate. Should not attach the coordinate From f78c9328c2c3606e87e7565ce6beda745b5fbe53 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 27 Aug 2017 15:49:05 +0900 Subject: [PATCH 064/113] fix related to pytest.warns --- xarray/core/dataset.py | 3 +++ xarray/tests/test_dataset.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fe27bd619e5..22e1569da75 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1095,6 +1095,9 @@ def maybe_chunk(name, var, chunks): for k, v in self.variables.items()]) return self._replace_vars_and_dims(variables) + def _validate_indexers(self, indexers): + pass + def _get_indexers_coordinates(self, indexers): """ Extract coordinates from indexers. 
Returns an OrderedDict mapping from coordinate name to the diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2cf0da8f7fd..95c09c7969c 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -996,7 +996,6 @@ def test_isel_dataarray(self): # make sure the coordinate confliction raises a warning with pytest.warns(FutureWarning) as w: actual = data.isel(dim2=indexing_da) - assert len(w) == 1 assert "Indexer's coordiante dim2 conflicts" in str(w[-1].message) # isel for the coordinate. Should not attach the coordinate From 1c027cd364dadf987df951fe6d3d6b359bb06404 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 27 Aug 2017 16:23:13 +0900 Subject: [PATCH 065/113] Another fix related to warning. --- xarray/tests/test_dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 95c09c7969c..35382710c4b 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -994,9 +994,10 @@ def test_isel_dataarray(self): self.assertDataArrayIdentical(actual['dim2'], data['dim2'].isel(dim2=np.arange(1, 4))) # make sure the coordinate confliction raises a warning - with pytest.warns(FutureWarning) as w: + with pytest.warns(FutureWarning) as ws: actual = data.isel(dim2=indexing_da) - assert "Indexer's coordiante dim2 conflicts" in str(w[-1].message) + assert any(["Indexer's coordiante dim2 conflicts" in str(w.message) + for w in ws]) # isel for the coordinate. 
Should not attach the coordinate actual = data['dim2'].isel(dim2=indexing_da) From d11829f4c2175d535899b24a23983e9868b9c15a Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 27 Aug 2017 17:01:35 +0900 Subject: [PATCH 066/113] Raise an Error for confusing indexing type --- xarray/core/dataset.py | 58 +++++++++++++++++++++++------------- xarray/tests/test_dataset.py | 17 +++++++++-- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 22e1569da75..b45dae87d20 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1096,7 +1096,42 @@ def maybe_chunk(name, var, chunks): return self._replace_vars_and_dims(variables) def _validate_indexers(self, indexers): - pass + """ Here we make sure + + indexer has a valid keys + + indexer is in a valid data type + + raise an Error for some confusing case. + """ + from .dataarray import DataArray + + invalid = [k for k in indexers if k not in self.dims] + if invalid: + raise ValueError("dimensions %r do not exist" % invalid) + + for k, v in iteritems(indexers): + dims = getattr(v, 'dims', None) + if (dims is not None and k not in dims and + any([d not in indexers for d in dims]) and + any([d in self.dims for d in dims])): + raise ValueError( + 'Trying to index along dimension {0:s} with a variable ' + 'with dimensions {1:s}, which is also a dimension of the ' + 'indexed array.'.format(k, str(dims))) + + # all indexers should be int, slice, np.ndarrays, or Variable + indexers_list = [] + for k, v in iteritems(indexers): + if isinstance(v, integer_types + (slice, Variable)): + pass + elif isinstance(v, DataArray): + v = v.variable + elif isinstance(v, tuple): + v = as_variable(v) + elif isinstance(v, Dataset): + raise TypeError('cannot use a Dataset as an indexer') + else: + v = np.asarray(v) + indexers_list.append((k, v)) + return indexers_list def _get_indexers_coordinates(self, indexers): """ Extract coordinates from indexers. 
@@ -1176,26 +1211,7 @@ def isel(self, drop=False, **indexers): Dataset.sel DataArray.isel """ - from .dataarray import DataArray - - invalid = [k for k in indexers if k not in self.dims] - if invalid: - raise ValueError("dimensions %r do not exist" % invalid) - - # all indexers should be int, slice, np.ndarrays, or Variable - indexers_list = [] - for k, v in iteritems(indexers): - if isinstance(v, integer_types + (slice, Variable)): - pass - elif isinstance(v, DataArray): - v = v.variable - elif isinstance(v, tuple): - v = as_variable(v) - elif isinstance(v, Dataset): - raise TypeError('cannot use a Dataset as an indexer') - else: - v = np.asarray(v) - indexers_list.append((k, v)) + indexers_list = self._validate_indexers(indexers) coord_vars = self._get_indexers_coordinates(indexers) coord_names = set(self._coord_names) | set(coord_vars) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 35382710c4b..5743474e99c 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1008,10 +1008,11 @@ def test_isel_dataarray(self): indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': data['dim2'].values[1:4]}) self.assertDataArrayIdentical(data['dim2'][1:4], indexing_da['dim2']) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") + with pytest.warns(FutureWarning) as ws: actual = data.isel(dim2=indexing_da) - assert len(w) == 0 # no warning + # does not warn + assert all(["Indexer's coordiante dim2 conflicts" not in + str(w.message) for w in ws]) # boolean data array with coordinate with the same name indexing_da = (indexing_da < 3) @@ -1048,6 +1049,16 @@ def test_isel_dataarray(self): actual = data.isel(dim2=indexing_da) assert 'station' not in actual + def test_isel_dataarray_error(self): + data = create_test_data() + + indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], + coords={'dim2': np.random.randn(3)}) + with self.assertRaisesRegexp(ValueError, 'Trying to index 
along'): + data.isel(dim1=indexing_da) + # this should not raise an error + data.isel(dim1=indexing_da, dim2=indexing_da) + def test_sel(self): data = create_test_data() int_slicers = {'dim1': slice(None, None, 2), From 0777128d7f52a8d7d0400fda009e72302c08f2af Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 27 Aug 2017 17:40:21 +0900 Subject: [PATCH 067/113] Minor fix --- xarray/tests/test_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 5743474e99c..8aa9d3e5f45 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1008,11 +1008,13 @@ def test_isel_dataarray(self): indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': data['dim2'].values[1:4]}) self.assertDataArrayIdentical(data['dim2'][1:4], indexing_da['dim2']) + with pytest.warns(FutureWarning) as ws: actual = data.isel(dim2=indexing_da) # does not warn assert all(["Indexer's coordiante dim2 conflicts" not in str(w.message) for w in ws]) + warnings.warn('dummy', FutureWarning, stacklevel=3) # boolean data array with coordinate with the same name indexing_da = (indexing_da < 3) From f580c99a001dce57a9af1645b01617dc46785afd Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 27 Aug 2017 18:56:14 +0900 Subject: [PATCH 068/113] Test for indexing by a scalar coordinate. 
--- xarray/tests/test_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8aa9d3e5f45..bddb6516659 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1050,6 +1050,8 @@ def test_isel_dataarray(self): indexing_da = DataArray(3, dims=[], coords={'station': 2}) actual = data.isel(dim2=indexing_da) assert 'station' not in actual + actual = data.isel(dim2=indexing_da['station']) + assert 'station' not in actual def test_isel_dataarray_error(self): data = create_test_data() From 4ebe8521d34f9170d7332d89f41df4bc1c64657f Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 29 Aug 2017 09:27:58 +0900 Subject: [PATCH 069/113] Modified test --- xarray/tests/test_dataset.py | 8 +++++++- xarray/tests/test_variable.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index bddb6516659..7fef573a4bd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1055,7 +1055,6 @@ def test_isel_dataarray(self): def test_isel_dataarray_error(self): data = create_test_data() - indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': np.random.randn(3)}) with self.assertRaisesRegexp(ValueError, 'Trying to index along'): @@ -1063,6 +1062,13 @@ def test_isel_dataarray_error(self): # this should not raise an error data.isel(dim1=indexing_da, dim2=indexing_da) + # slice and vector mixed indexing resulting in the same dimension + another_data = DataArray(np.arange(60).reshape(3, 4, 5), + dims=['x', 'y', 'z']) + ind = DataArray([0, 1, 2], dims=['x']) + with self.assertRaisesRegexp(ValueError, 'Trying to index along'): + another_data.isel(y=ind) + def test_sel(self): data = create_test_data() int_slicers = {'dim1': slice(None, None, 2), diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index be4b4ba2472..30c13028f8b 100644 --- 
a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -596,6 +596,21 @@ def test_getitem_fancy(self): expected = Variable(['y', 'x'], [[6]]) self.assertVariableIdentical(v_new, expected) + # slice and vector mixed indexing resulting in the same dimension + v = Variable(['x', 'y', 'z'], np.arange(60).reshape(3, 4, 5)) + ind = Variable(['x'], [0, 1, 2]) + v_new = v[:, ind] + expected = Variable(('x', 'z'), np.zeros((3, 5))) + expected[0] = v.data[0, 0] + expected[1] = v.data[1, 1] + expected[2] = v.data[2, 2] + self.assertVariableIdentical(v_new, expected) + + v = Variable(['x', 'y', 'z'], np.arange(60).reshape(3, 4, 5)) + ind = Variable(['x'], [0, 1]) + with self.assertRaisesRegexp(IndexError, 'Dimensions of indexers mis'): + v_new = v[:, ind] + def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) From 20f5cb95d9236994ef746ac2bd6a7cbbdaf78806 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 29 Aug 2017 14:11:01 +0900 Subject: [PATCH 070/113] Remove too specialized errorning --- xarray/core/dataset.py | 10 ---------- xarray/tests/test_dataset.py | 16 ---------------- xarray/tests/test_variable.py | 10 ++++++---- 3 files changed, 6 insertions(+), 30 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b45dae87d20..4c351a2823d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1107,16 +1107,6 @@ def _validate_indexers(self, indexers): if invalid: raise ValueError("dimensions %r do not exist" % invalid) - for k, v in iteritems(indexers): - dims = getattr(v, 'dims', None) - if (dims is not None and k not in dims and - any([d not in indexers for d in dims]) and - any([d in self.dims for d in dims])): - raise ValueError( - 'Trying to index along dimension {0:s} with a variable ' - 'with dimensions {1:s}, which is also a dimension of the ' - 'indexed array.'.format(k, str(dims))) - # all indexers should be int, slice, np.ndarrays, or Variable indexers_list = [] for k, v in 
iteritems(indexers): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7fef573a4bd..f6963ce2814 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1053,22 +1053,6 @@ def test_isel_dataarray(self): actual = data.isel(dim2=indexing_da['station']) assert 'station' not in actual - def test_isel_dataarray_error(self): - data = create_test_data() - indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], - coords={'dim2': np.random.randn(3)}) - with self.assertRaisesRegexp(ValueError, 'Trying to index along'): - data.isel(dim1=indexing_da) - # this should not raise an error - data.isel(dim1=indexing_da, dim2=indexing_da) - - # slice and vector mixed indexing resulting in the same dimension - another_data = DataArray(np.arange(60).reshape(3, 4, 5), - dims=['x', 'y', 'z']) - ind = DataArray([0, 1, 2], dims=['x']) - with self.assertRaisesRegexp(ValueError, 'Trying to index along'): - another_data.isel(y=ind) - def test_sel(self): data = create_test_data() int_slicers = {'dim1': slice(None, None, 2), diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 30c13028f8b..d813e8a7241 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -606,10 +606,8 @@ def test_getitem_fancy(self): expected[2] = v.data[2, 2] self.assertVariableIdentical(v_new, expected) - v = Variable(['x', 'y', 'z'], np.arange(60).reshape(3, 4, 5)) - ind = Variable(['x'], [0, 1]) - with self.assertRaisesRegexp(IndexError, 'Dimensions of indexers mis'): - v_new = v[:, ind] + v_new = v[:, ind.data] + assert v_new.shape == (3, 3, 5) def test_getitem_error(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) @@ -626,6 +624,10 @@ def test_getitem_error(self): with self.assertRaisesRegexp(IndexError, '2-dimensional boolean'): v[dict(x=ind)] + v = Variable(['x', 'y', 'z'], np.arange(60).reshape(3, 4, 5)) + ind = Variable(['x'], [0, 1]) + with self.assertRaisesRegexp(IndexError, 'Dimensions of indexers 
mis'): + v_new = v[:, ind] class TestVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(Variable) From ab08af8017212be7b75a9f522c5f518ab97ae82f Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 29 Aug 2017 18:30:20 +0900 Subject: [PATCH 071/113] Working with docs --- doc/indexing.rst | 37 ++++++++++++++++++++++------------ doc/whats-new.rst | 15 ++++++++++++++ xarray/tests/test_dataarray.py | 14 +++++++++++++ 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 378a04b3942..0a888589421 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -11,10 +11,16 @@ Indexing and selecting data import xarray as xr np.random.seed(123456) -Similarly to pandas objects, xarray objects support both integer and label -based lookups along each dimension. However, xarray objects also have named -dimensions, so you can optionally use dimension names instead of relying on the -positional ordering of dimensions. +The basic way to access each element of xarray's multi-dimensional +object is to use Python `[obj]` syntax, such as `array[i, j]`. +As xarray objects can store coordinates corresponding to each dimension of the +array, label-based indexing similar to pandas object is also possible. +In label-based indexing, the element position i is automatically looked-up from +the coordinate values. + +Furthermore, the dimensions of xarray object have names and +you can also lookup the dimensions ordering by name, +instead of remembering the positional ordering of dimensions by yourself. Thus in total, xarray supports four different kinds of indexing, as described below and summarized in this table: @@ -58,9 +64,8 @@ Attributes are persisted in all indexing operations. .. warning:: Positional indexing deviates from the NumPy when indexing with multiple - arrays like ``arr[[0, 1], [0, 1]]``, as described in :ref:`orthogonal`. - See :ref:`pointwise indexing` for how to achieve this functionality in - xarray. 
+ arrays like ``arr[[0, 1], [0, 1]]``, as described in + :ref:`advanced_indexing`. xarray also supports label-based indexing, just like pandas. Because we use a :py:class:`pandas.Index` under the hood, label based indexing is very @@ -85,10 +90,10 @@ Setting values with label based indexing is also supported: arr -Indexing with labeled dimensions --------------------------------- +Indexing with dimension names +----------------------------- -With labeled dimensions, we do not have to rely on dimension order and can +With the dimension names, we do not have to rely on dimension order and can use them explicitly to slice data. There are two ways to do this: 1. Use a dictionary as the argument for array positional or label based array @@ -125,10 +130,16 @@ Python :py:func:`slice` objects or 1-dimensional arrays. __ http://legacy.python.org/dev/peps/pep-0472/ + +Assignment +---------- + +As described later, + .. warning:: - Do not try to assign values when using any of the indexing methods ``isel``, - ``isel_points``, ``sel`` or ``sel_points``:: + Do not try to assign values when using any of the indexing methods ``isel`` + or ``sel``:: # DO NOT do this arr.isel(space=0) = 0 @@ -386,7 +397,7 @@ should still avoid assignment with chained indexing. .. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy -.. _orthogonal: +.. _advanced_indexing: Orthogonal (outer) vs. vectorized indexing ------------------------------------------ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7d06a49d486..9e8d6d5960e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -18,6 +18,21 @@ What's New v0.9.7 (unreleased) ------------------- +Backward Incompatible Changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- xarray now supports vectorized indexing, where we consider the dimension of + indexer, e.g. `array.sel(x=ind)` with `ind.dims == ('y', )` . 
+ This enables us more advanced indexing, including outer indexing, diagonal + indexing, as well as vectorized indexing. + Due to this change, existing uses of xarray objects to index other xarray + objects will break in some cases. + See *** for the details. + (:issue:`1444`, :issue:***, ) + By `Keisuke Fujii `_ and + `Stephan Hoyer `_. + + Enhancements ~~~~~~~~~~~~ diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e8e0bbd0a8c..0ed710f54a3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -741,11 +741,25 @@ def test_loc(self): self.assertDataArrayIdentical(da[:3, :4], da.loc[['a', 'b', 'c'], np.arange(4)]) self.assertDataArrayIdentical(da[:, :4], da.loc[:, self.ds['y'] < 4]) + + def test_loc_assign(self): + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + da = self.ds['foo'] + # assignment da.loc['a':'j'] = 0 self.assertTrue(np.all(da.values == 0)) da.loc[{'x': slice('a', 'j')}] = 2 self.assertTrue(np.all(da.values == 2)) + da.loc[{'x': slice('a', 'j')}] = 2 + self.assertTrue(np.all(da.values == 2)) + + # Multi dimensional case + da = DataArray(np.arange(12).reshape(3, 4), dims=['x', 'y']) + da.loc[0] = 0 + self.assertTrue(np.all(da.values[0, 0] == 0)) + self.assertTrue(np.all(da.values[0, 1] != 0)) + def test_loc_single_boolean(self): data = DataArray([0, 1], coords=[[True, False]]) self.assertEqual(data.loc[True], 0) From 92dded65d32773431b996da45148a65f9627d80b Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 29 Aug 2017 23:26:12 +0900 Subject: [PATCH 072/113] Found a bug in as_variable --- xarray/core/variable.py | 3 ++- xarray/tests/test_dataarray.py | 34 ++++++++++++++++++++++++++++++++-- xarray/tests/test_variable.py | 7 +++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index dbdbe8af6a4..7b450ee7773 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -491,7 +491,8 @@ def 
_broadcast_indexes_vectorized(self, key): out_dims_set.add(dim) else: try: - variable = as_variable(value, name=dim) + variable = (value if isinstance(value, Variable) else + as_variable(value, name=dim)) except MissingDimensionsError: # change to better exception raise IndexError("Unlabeled multi-dimensional array " "cannot be used for indexing.") diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 0ed710f54a3..bd49acb2e24 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -468,6 +468,31 @@ def test_getitem_coords(self): dims='x') self.assertDataArrayIdentical(expected, actual) + def test_getitem_dataarray(self): + # It should not conflict + da = DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y']) + ind = DataArray([[0, 1], [0, 1]], dims=['x', 'z']) + actual = da[ind] + self.assertArrayEqual(actual, da.values[[[0, 1], [0, 1]], :]) + + def test_setitem(self): + # basic indexing should work as numpy's indexing + tuples = [(0, 0), (0, slice(None, None)), + (slice(None, None), slice(None, None)), + (slice(None, None), 0), + ([1, 0], slice(None, None)), + (slice(None, None), [1, 0])] + for t in tuples: + expected = np.arange(6).reshape(3, 2) + orig = DataArray(np.arange(6).reshape(3, 2), + {'x': [1, 2, 3], 'y': ['a', 'b'], 'z': 4, + 'x2': ('x', ['a', 'b', 'c']), + 'y2': ('y', ['d', 'e'])}, + dims=['x', 'y']) + orig[t] = 1 + expected[t] = 1 + self.assertArrayEqual(orig.values, expected) + def test_attr_sources_multiindex(self): # make sure attr-style access for multi-index levels # returns DataArray objects @@ -755,10 +780,15 @@ def test_loc_assign(self): self.assertTrue(np.all(da.values == 2)) # Multi dimensional case + da = DataArray(np.arange(12).reshape(3, 4), dims=['x', 'y']) + da.loc[0, 0] = 0 + assert da.values[0, 0] == 0 + assert da.values[0, 1] != 0 + da = DataArray(np.arange(12).reshape(3, 4), dims=['x', 'y']) da.loc[0] = 0 - self.assertTrue(np.all(da.values[0, 0] == 0)) - 
self.assertTrue(np.all(da.values[0, 1] != 0)) + self.assertTrue(np.all(da.values[0] == np.zeros(4))) + assert da.values[1, 0] != 0 def test_loc_single_boolean(self): data = DataArray([0, 1], coords=[[True, False]]) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d813e8a7241..5d2289291a2 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -538,6 +538,13 @@ def test_getitem_fancy(self): assert v_new.dims == ('a', 'b', 'y') self.assertArrayEqual(v_new, v_data[[[0, 1, 1], [1, 1, 0]], :]) + # It would be ok if indexed with the multi-dimensional array including + # the same name + ind = Variable(['x', 'b'], [[0, 1, 1], [1, 1, 0]]) + v_new = v[ind] + assert v_new.dims == ('x', 'b', 'y') + self.assertArrayEqual(v_new, v_data[[[0, 1, 1], [1, 1, 0]], :]) + ind = Variable(['a', 'b'], [[0, 1, 2], [2, 1, 0]]) v_new = v[dict(y=ind)] assert v_new.dims == ('x', 'a', 'b') From a6244243211702fdd72f18aeb9d251678edb432b Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 30 Aug 2017 00:04:40 +0900 Subject: [PATCH 073/113] Working with docs --- doc/indexing.rst | 101 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 11 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 0a888589421..b5595fbfbf1 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -11,16 +11,16 @@ Indexing and selecting data import xarray as xr np.random.seed(123456) -The basic way to access each element of xarray's multi-dimensional -object is to use Python `[obj]` syntax, such as `array[i, j]`. +The most basic way to access each element of xarray's multi-dimensional +object is to use Python ``[obj]`` syntax, such as ``array[i, j]``. As xarray objects can store coordinates corresponding to each dimension of the array, label-based indexing similar to pandas object is also possible. -In label-based indexing, the element position i is automatically looked-up from -the coordinate values. 
+In label-based indexing, the element position ``i`` is automatically +looked-up from the coordinate values. -Furthermore, the dimensions of xarray object have names and -you can also lookup the dimensions ordering by name, -instead of remembering the positional ordering of dimensions by yourself. +Dimensions of xarray object have names and you can also lookup the dimensions +ordering by name, instead of remembering the positional ordering of dimensions +by yourself. Thus in total, xarray supports four different kinds of indexing, as described below and summarized in this table: @@ -43,6 +43,7 @@ below and summarized in this table: | | | ``arr.loc[dict(space='IA')]`` | ``ds.loc[dict(space='IA')]`` | +------------------+--------------+---------------------------------+--------------------------------+ + Positional indexing ------------------- @@ -131,10 +132,89 @@ Python :py:func:`slice` objects or 1-dimensional arrays. __ http://legacy.python.org/dev/peps/pep-0472/ -Assignment ----------- +.. _advanced_indexing: + +Indexing multi-dimensional array +--------------------------------- + +As similar to numpy's nd-array, xarray supports two types of indexing, +`basic- and advanced-indexing`__. +However, our indexing rule differs from numpy's nd-array. + +__ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html + + +Our indexing is basically orthogonal, i.e. +if you pass multiple integer sequences to an array, they work independently +along each dimension (similar to the way vector subscripts work in fortran). + +.. ipython:: python + + da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], + coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + da + da[[0, 1], [1, 1]] + # Sequential indexing gives the same result. + da[[0, 1], [1, 1]] == da[[0, 1]][:, [1, 1]] + + +In order to make more advanced indexing, you can supply +:py:meth:`~xarray.DataArray` as indexers. 
+If :py:meth:`~xarray.DataArray` is provided as indexers, the dimension of the +resultant array is determined by the indexers' dimension names, + +.. ipython:: python + + ind_x = xr.DataArray([0, 1], dims=['x']) + ind_y = xr.DataArray([0, 1], dims=['y']) + da[ind_x, ind_y] # orthogonal indexing + da[ind_x, ind_x] # vectorized indexing + +If you just provide slices or sequences, which do not have named-dimensions, +they will be understood as the same dimension to index along. + +.. ipython:: python + + # Because [0, 1] is used to index along dimension 'x', + # it is assumed to have dimension 'x' + da[[0, 1], ind_x] -As described later, + +Furthermore, you can use multi-dimensional :py:meth:`~xarray.DataArray` +as indexers, + +.. ipython:: python + + ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) + da[ind] + +To summarize, our indexing rule is based on our `broadcasting`__ scheme. +For the above example, the result shape will be + +__ :py:meth:`~xarray.broadcast` + + +These advanced indexing also works with ``isel``, ``loc``, and ``sel``. + +.. ipython:: python + + ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) + da.isel(y=ind) # same to da[:, ind] + + ind = xr.DataArray([['a', 'b'], ['b', 'a']], dims=['a', 'b']) + da.loc[:, ind] # same to da.sel(y=ind) + + +Assigning values +---------------- + +As similar to `numpy's nd-array`__, the value assignment behaves differently +depending on whether basic- or advanced-indexing. + +__ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#detailed-notes + +1. Basic indexing. + Indexer consists of slice, ellipse, or integer. Not a sequences of integer. .. warning:: @@ -397,7 +477,6 @@ should still avoid assignment with chained indexing. .. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy -.. _advanced_indexing: Orthogonal (outer) vs. 
vectorized indexing ------------------------------------------ From a4cd724a8808fbc3c68db47a260f75a33c420469 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 30 Aug 2017 00:24:46 +0900 Subject: [PATCH 074/113] Enable indexing IndexVariable by multi-dimensional Variable. --- doc/indexing.rst | 37 ++++++++++++---------------------- xarray/core/variable.py | 8 +++++--- xarray/tests/test_dataarray.py | 8 ++++++++ xarray/tests/test_variable.py | 7 +++++++ 4 files changed, 33 insertions(+), 27 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 0a888589421..378a04b3942 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -11,16 +11,10 @@ Indexing and selecting data import xarray as xr np.random.seed(123456) -The basic way to access each element of xarray's multi-dimensional -object is to use Python `[obj]` syntax, such as `array[i, j]`. -As xarray objects can store coordinates corresponding to each dimension of the -array, label-based indexing similar to pandas object is also possible. -In label-based indexing, the element position i is automatically looked-up from -the coordinate values. - -Furthermore, the dimensions of xarray object have names and -you can also lookup the dimensions ordering by name, -instead of remembering the positional ordering of dimensions by yourself. +Similarly to pandas objects, xarray objects support both integer and label +based lookups along each dimension. However, xarray objects also have named +dimensions, so you can optionally use dimension names instead of relying on the +positional ordering of dimensions. Thus in total, xarray supports four different kinds of indexing, as described below and summarized in this table: @@ -64,8 +58,9 @@ Attributes are persisted in all indexing operations. .. warning:: Positional indexing deviates from the NumPy when indexing with multiple - arrays like ``arr[[0, 1], [0, 1]]``, as described in - :ref:`advanced_indexing`. 
+ arrays like ``arr[[0, 1], [0, 1]]``, as described in :ref:`orthogonal`. + See :ref:`pointwise indexing` for how to achieve this functionality in + xarray. xarray also supports label-based indexing, just like pandas. Because we use a :py:class:`pandas.Index` under the hood, label based indexing is very @@ -90,10 +85,10 @@ Setting values with label based indexing is also supported: arr -Indexing with dimension names ------------------------------ +Indexing with labeled dimensions +-------------------------------- -With the dimension names, we do not have to rely on dimension order and can +With labeled dimensions, we do not have to rely on dimension order and can use them explicitly to slice data. There are two ways to do this: 1. Use a dictionary as the argument for array positional or label based array @@ -130,16 +125,10 @@ Python :py:func:`slice` objects or 1-dimensional arrays. __ http://legacy.python.org/dev/peps/pep-0472/ - -Assignment ----------- - -As described later, - .. warning:: - Do not try to assign values when using any of the indexing methods ``isel`` - or ``sel``:: + Do not try to assign values when using any of the indexing methods ``isel``, + ``isel_points``, ``sel`` or ``sel_points``:: # DO NOT do this arr.isel(space=0) = 0 @@ -397,7 +386,7 @@ should still avoid assignment with chained indexing. .. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy -.. _advanced_indexing: +.. _orthogonal: Orthogonal (outer) vs. 
vectorized indexing ------------------------------------------ diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 7b450ee7773..323219faab2 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1389,10 +1389,12 @@ def chunk(self, chunks=None, name=None, lock=False): return self.copy(deep=False) def __getitem__(self, key): - dims, index_tuple, _ = self._broadcast_indexes(key) + dims, index_tuple, new_order = self._broadcast_indexes(key) if len(dims) > 1: - raise IndexError('Multiple dimension array cannot be used for ' - 'indexing IndexVariable: {}'.format(key)) + # returns Variable rather than IndexVariable if multi-dimensional + return Variable(self.dims, self.data, self._attrs, self._encoding, + fastpath=True)[key] + values = self._indexable_data[index_tuple] if getattr(values, 'ndim', 0) == 0: return Variable((), values, self._attrs, self._encoding) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index bd49acb2e24..e2e48d8c5e3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -475,6 +475,14 @@ def test_getitem_dataarray(self): actual = da[ind] self.assertArrayEqual(actual, da.values[[[0, 1], [0, 1]], :]) + da = DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], + coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + ind = xr.DataArray([[0, 1], [0, 1]], dims=['X', 'Y']) + actual = da[ind] + expected = da.values[[[0, 1], [0, 1]], :] + self.assertArrayEqual(actual, expected) + assert actual.dims == ('X', 'Y', 'y') + def test_setitem(self): # basic indexing should work as numpy's indexing tuples = [(0, 0), (0, slice(None, None)), diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 5d2289291a2..853d2d93658 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -77,6 +77,13 @@ def test_getitem_1d(self): assert v_new.dims == ('x', ) self.assertArrayEqual(v_new, v._data) + # 1d-variable should be indexable by 
multi-dimensional Variable + ind = Variable(('a', 'b'), [[0, 1], [0, 1]]) + v_new = v[ind] + assert v_new.dims == ('a', 'b') + expected = np.array(v._data)[([0, 1], [0, 1]), ] + self.assertArrayEqual(v_new, expected) + def _assertIndexedLikeNDArray(self, variable, expected_value0, expected_dtype=None): """Given a 1-dimensional variable, verify that the variable is indexed From f66c9b68a9f415fcc9b482780f36f1395e144f24 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 30 Aug 2017 20:55:06 +0900 Subject: [PATCH 075/113] Found a bug in indexing np.ndarray --- doc/indexing.rst | 37 ++++++++++++++++++++++------------ doc/whats-new.rst | 2 +- xarray/tests/test_dataarray.py | 17 ++++++++++++++++ 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 378a04b3942..0a888589421 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -11,10 +11,16 @@ Indexing and selecting data import xarray as xr np.random.seed(123456) -Similarly to pandas objects, xarray objects support both integer and label -based lookups along each dimension. However, xarray objects also have named -dimensions, so you can optionally use dimension names instead of relying on the -positional ordering of dimensions. +The basic way to access each element of xarray's multi-dimensional +object is to use Python `[obj]` syntax, such as `array[i, j]`. +As xarray objects can store coordinates corresponding to each dimension of the +array, label-based indexing similar to pandas object is also possible. +In label-based indexing, the element position i is automatically looked-up from +the coordinate values. + +Furthermore, the dimensions of xarray object have names and +you can also lookup the dimensions ordering by name, +instead of remembering the positional ordering of dimensions by yourself. 
Thus in total, xarray supports four different kinds of indexing, as described below and summarized in this table: @@ -58,9 +64,8 @@ Attributes are persisted in all indexing operations. .. warning:: Positional indexing deviates from the NumPy when indexing with multiple - arrays like ``arr[[0, 1], [0, 1]]``, as described in :ref:`orthogonal`. - See :ref:`pointwise indexing` for how to achieve this functionality in - xarray. + arrays like ``arr[[0, 1], [0, 1]]``, as described in + :ref:`advanced_indexing`. xarray also supports label-based indexing, just like pandas. Because we use a :py:class:`pandas.Index` under the hood, label based indexing is very @@ -85,10 +90,10 @@ Setting values with label based indexing is also supported: arr -Indexing with labeled dimensions --------------------------------- +Indexing with dimension names +----------------------------- -With labeled dimensions, we do not have to rely on dimension order and can +With the dimension names, we do not have to rely on dimension order and can use them explicitly to slice data. There are two ways to do this: 1. Use a dictionary as the argument for array positional or label based array @@ -125,10 +130,16 @@ Python :py:func:`slice` objects or 1-dimensional arrays. __ http://legacy.python.org/dev/peps/pep-0472/ + +Assignment +---------- + +As described later, + .. warning:: - Do not try to assign values when using any of the indexing methods ``isel``, - ``isel_points``, ``sel`` or ``sel_points``:: + Do not try to assign values when using any of the indexing methods ``isel`` + or ``sel``:: # DO NOT do this arr.isel(space=0) = 0 @@ -386,7 +397,7 @@ should still avoid assignment with chained indexing. .. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy -.. _orthogonal: +.. _advanced_indexing: Orthogonal (outer) vs. 
vectorized indexing ------------------------------------------ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9e8d6d5960e..11829754920 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -28,7 +28,7 @@ Backward Incompatible Changes Due to this change, existing uses of xarray objects to index other xarray objects will break in some cases. See *** for the details. - (:issue:`1444`, :issue:***, ) + (:issue:`1444`, :issue:`1436`, ) By `Keisuke Fujii `_ and `Stephan Hoyer `_. diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e2e48d8c5e3..af911f9f43a 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -483,6 +483,23 @@ def test_getitem_dataarray(self): self.assertArrayEqual(actual, expected) assert actual.dims == ('X', 'Y', 'y') + # boolean indexing + ind = xr.DataArray([True, True, False], dims=['x']) + self.assertDataArrayEqual(da[ind], da[[0, 1], :]) + self.assertDataArrayEqual(da[ind], da[[0, 1]]) + self.assertDataArrayEqual(da[ind], da[ind.values]) + + ind = xr.DataArray([True, True, False], dims=['a'], + coords={'a': [0, 1, 2]}) + actual = da[ind] + assert 'a' in actual + self.assertArrayEqual(actual['a'], [0, 1]) + + # make sure we can index a np.ndarray + array = np.arange(3) + actual = array[ind] + self.assertArrayEqual(actual, [0, 1]) + def test_setitem(self): # basic indexing should work as numpy's indexing tuples = [(0, 0), (0, slice(None, None)), From 24309c41153094ef099318d6629744d574174078 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 30 Aug 2017 22:57:34 +0900 Subject: [PATCH 076/113] Added a test for boolean-DataArray indexing. 
--- xarray/tests/test_dataarray.py | 5 ----- xarray/tests/test_variable.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index af911f9f43a..0fbaf66766f 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -495,11 +495,6 @@ def test_getitem_dataarray(self): assert 'a' in actual self.assertArrayEqual(actual['a'], [0, 1]) - # make sure we can index a np.ndarray - array = np.arange(3) - actual = array[ind] - self.assertArrayEqual(actual, [0, 1]) - def test_setitem(self): # basic indexing should work as numpy's indexing tuples = [(0, 0), (0, slice(None, None)), diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 853d2d93658..d4b9388f8db 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -77,6 +77,8 @@ def test_getitem_1d(self): assert v_new.dims == ('x', ) self.assertArrayEqual(v_new, v._data) + def test_getitem_1d_fancy(self): + v = self.cls(['x'], [0, 1, 2]) # 1d-variable should be indexable by multi-dimensional Variable ind = Variable(('a', 'b'), [[0, 1], [0, 1]]) v_new = v[ind] @@ -1386,6 +1388,18 @@ def test_getitem_fancy(self): pytest.xfail("vindex from latest dask is required") super(TestVariableWithDask, self).test_getitem_fancy() + def test_getitem_1d_fancy(self): + import dask + if LooseVersion(dask.__version__) <= LooseVersion('0.15.1'): + pytest.xfail("vindex from latest dask is required") + super(TestVariableWithDask, self).test_getitem_1d_fancy() + + def test_getitem_fancy(self): + import dask + if LooseVersion(dask.__version__) <= LooseVersion('0.15.1'): + pytest.xfail("vindex from latest dask is required") + super(TestVariableWithDask, self).test_getitem_fancy() + class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From 9cbaff95cc4c848053c0ff6a1a2dc1cc463835f3 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 30 Aug 2017 
23:26:41 +0900 Subject: [PATCH 077/113] Make sure assignment certainly works. --- doc/indexing.rst | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index b5595fbfbf1..d5ae731d225 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -11,8 +11,12 @@ Indexing and selecting data import xarray as xr np.random.seed(123456) + +The point of xarray is to introduce a numpy-ndarray-like multidimensional array object into a powerful pandas's flexible data handling scheme. +We provide several types (say, numpy-like and pandas-like, and more advanced) indexing functionalities. + The most basic way to access each element of xarray's multi-dimensional -object is to use Python ``[obj]`` syntax, such as ``array[i, j]``. +object is to use Python ``[obj]`` syntax, such as ``array[i, j]``, where ``i`` and ``j`` are both integers. As xarray objects can store coordinates corresponding to each dimension of the array, label-based indexing similar to pandas object is also possible. In label-based indexing, the element position ``i`` is automatically @@ -44,6 +48,12 @@ below and summarized in this table: +------------------+--------------+---------------------------------+--------------------------------+ +More advanced indexing is also possible for all the four types of indexings by +supplying :py:class:`~xarray.DataArray` objects as indexer. +See :ref:`advanced_indexing` for the details. + + + Positional indexing ------------------- @@ -171,7 +181,7 @@ resultant array is determined by the indexers' dimension names, da[ind_x, ind_x] # vectorized indexing If you just provide slices or sequences, which do not have named-dimensions, -they will be understood as the same dimension to index along. +they will be understood as the same dimension which is indexed along. .. 
ipython:: python @@ -188,10 +198,8 @@ as indexers, ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) da[ind] -To summarize, our indexing rule is based on our `broadcasting`__ scheme. -For the above example, the result shape will be - -__ :py:meth:`~xarray.broadcast` +To summarize, our indexing rule is based on our :ref:`compute.broadcasting` +scheme. These advanced indexing also works with ``isel``, ``loc``, and ``sel``. From a5c7766719946c72114a4a3aa34f6f92da10e4dc Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 30 Aug 2017 23:46:46 +0900 Subject: [PATCH 078/113] Added assignment section --- doc/indexing.rst | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index d5ae731d225..5c48d2ee9fe 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -216,13 +216,29 @@ These advanced indexing also works with ``isel``, ``loc``, and ``sel``. Assigning values ---------------- -As similar to `numpy's nd-array`__, the value assignment behaves differently -depending on whether basic- or advanced-indexing. +As similar to ``numpy's nd-array``, the value assignment behaves differently +depending on whether `basic- or advanced-indexing`__. -__ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#detailed-notes +__ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-to-indexed-arrays -1. Basic indexing. +1. Basic indexing Indexer consists of slice, ellipse, or integer. Not a sequences of integer. + By basic indexing, you can select a subset of an array to assign values. + +.. ipython:: python + + da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], + coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + da + da[0, 0] = -1 # Assign -1 to one element + da + + da[0] = -2 # The shape is different but broadcastable + da + + da.loc[:, 'a'] = -3 # assignment through label-based indexing is also possible + da + .. 
warning:: @@ -232,12 +248,11 @@ __ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#detailed-note # DO NOT do this arr.isel(space=0) = 0 - Depending on whether the underlying numpy indexing returns a copy or a - view, the method will fail, and when it fails, **it will fail - silently**. Instead, you should use normal index assignment:: - - # this is safe - arr[dict(space=0)] = 0 +2. Advanced indexing + If the underlying indexing is advanced, indexing returns a copy of the + array not a view. + In this case, the method will fail, and when it fails, **it will fail + silently**. .. _pointwise indexing: From f2421666f5864befd89d46c0575548d452c54ab7 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 31 Aug 2017 18:30:55 +0900 Subject: [PATCH 079/113] pep8 --- xarray/backends/scipy_.py | 2 +- xarray/core/npcompat.py | 13 ++-- xarray/core/variable.py | 16 ++--- xarray/tests/test_dataset.py | 115 ++++++++++++++++++++++++++++++++-- xarray/tests/test_variable.py | 15 ++--- 5 files changed, 132 insertions(+), 29 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 5b5dcd461f9..0c15085c618 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -11,7 +11,7 @@ from ..core.pycompat import iteritems, OrderedDict, basestring from ..core.utils import (Frozen, FrozenOrderedDict, NdimSizeLenMixin, DunderArrayMixin) -from ..core.indexing import NumpyIndexingAdapter, OuterIndexer, to_tuple +from ..core.indexing import NumpyIndexingAdapter from .common import WritableCFDataStore, DataStorePickleMixin from .netcdf3 import (is_valid_nc3_name, encode_nc3_attr_value, diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 02c8bdf5447..a32fb17d5a6 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -392,7 +392,6 @@ def nancumprod(a, axis=None, dtype=None, out=None): a, mask = _replace_nan(a, 1) return np.cumprod(a, axis=axis, dtype=dtype, out=out) - def normalize_axis_tuple(axis, ndim, 
argname=None, allow_duplicate=False): """ Normalizes an axis argument into a tuple of non-negative integer axes. @@ -412,8 +411,8 @@ def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False): axis : int, iterable of int The un-normalized index or indices of the axis. ndim : int - The number of dimensions of the array that `axis` should be normalized - against. + The number of dimensions of the array that `axis` should be + normalized against. argname : str, optional A prefix to put before the error message, typically the name of the argument. @@ -443,12 +442,12 @@ def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False): axis = tuple(normalize_axis_index(ax, ndim, argname) for ax in axis) if not allow_duplicate and len(set(axis)) != len(axis): if argname: - raise ValueError('repeated axis in `{}` argument'.format(argname)) + raise ValueError('repeated axis in `{}` argument'.format( + argname)) else: raise ValueError('repeated axis') return axis - def moveaxis(a, source, destination): """ Move axes of an array to new positions. @@ -464,8 +463,8 @@ def moveaxis(a, source, destination): source : int or sequence of int Original positions of the axes to move. These must be unique. destination : int or sequence of int - Destination positions for each of the original axes. These must also be - unique. + Destination positions for each of the original axes. These must + also be unique. Returns ------- diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 323219faab2..d565a181549 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -500,7 +500,8 @@ def _broadcast_indexes_vectorized(self, key): if variable.dtype.kind == 'b': # boolean indexing case if variable.ndim > 1: raise IndexError("{}-dimensional boolean indexing is " - "not supported. ".format(variable.ndim)) + "not supported. 
".format( + variable.ndim)) (variable,) = variable._nonzero() variables.append(variable) @@ -516,9 +517,9 @@ def _broadcast_indexes_vectorized(self, key): if dim in variable_dims: # We only convert slice objects to variables if they share # a dimension with at least one other variable. Otherwise, - # we can equivalently leave them as slices and transpose the - # result. This is significantly faster/more efficient for - # most array backends. + # we can equivalently leave them as slices aknd transpose + # the result. This is significantly faster/more efficient + # for most array backends. values = np.arange(*value.indices(self.sizes[dim])) variables.insert(i - len(slices), Variable((dim,), values)) else: @@ -585,7 +586,8 @@ def __setitem__(self, key, value): 'broadcast to indexing result with %s dimensions' % (value.shape, len(dims))) - value = value[(len(dims) - value.ndim) * (np.newaxis,) + (Ellipsis,)] + value = value[(len(dims) - value.ndim) * (np.newaxis,) + + (Ellipsis,)] value = moveaxis(value, new_order, range(len(new_order))) self._indexable_data[index_tuple] = value @@ -929,8 +931,8 @@ def set_dims(self, dims, shape=None): missing_dims = set(self.dims) - set(dims) if missing_dims: - raise ValueError('new dimensions %r must be a superset of existing ' - 'dimensions %r' % (dims, self.dims)) + raise ValueError('new dimensions %r must be a superset of ' + 'existing dimensions %r' % (dims, self.dims)) self_dims = set(self.dims) expanded_dims = tuple( diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index f6963ce2814..48f2afeffdb 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1113,6 +1113,115 @@ def test_sel_dataarray(self): self.assertDataArrayEqual(actual['new_dim'].drop('dim2'), ind['new_dim']) + def test_sel_equivalent_to_reindex_like(self): + data = create_test_data() + data['letters'] = ('dim3', 10 * ['a']) + + expected = data.isel(dim1=slice(10), time=slice(13)) + actual = 
data.reindex_like(expected) + self.assertDatasetIdentical(actual, expected) + actual_sel = data.sel(dim1=expected['dim1'], time=expected['time']) + self.assertDatasetIdentical(actual, actual_sel) + + expected = data.copy(deep=True) + expected['dim3'] = ('dim3', list('cdefghijkl')) + expected['var3'][:-2] = expected['var3'][2:] + expected['var3'][-2:] = np.nan + expected['letters'] = expected['letters'].astype(object) + expected['letters'][-2:] = np.nan + expected['numbers'] = expected['numbers'].astype(float) + expected['numbers'][:-2] = expected['numbers'][2:].values + expected['numbers'][-2:] = np.nan + actual = data.reindex_like(expected) + self.assertDatasetIdentical(actual, expected) + actual_sel = data.sel(**{d: expected[d] for d in expected.dims}) + self.assertDatasetIdentical(actual, actual_sel) + + def _test_sel_equivalent_to_reindex(self): + data = create_test_data() + self.assertDatasetIdentical(data, data.reindex()) + + expected = data.assign_coords(dim1=data['dim1']) + actual = data.reindex(dim1=data['dim1']) + self.assertDatasetIdentical(actual, expected) + + actual = data.reindex(dim1=data['dim1'].values) + self.assertDatasetIdentical(actual, expected) + + actual = data.reindex(dim1=data['dim1'].to_index()) + self.assertDatasetIdentical(actual, expected) + + with self.assertRaisesRegexp( + ValueError, 'cannot reindex or align along dimension'): + data.reindex(dim1=data['dim1'][:5]) + + expected = data.isel(dim2=slice(5)) + actual = data.reindex(dim2=data['dim2'][:5]) + self.assertDatasetIdentical(actual, expected) + + # test dict-like argument + actual = data.reindex({'dim2': data['dim2']}) + expected = data + self.assertDatasetIdentical(actual, expected) + with self.assertRaisesRegexp(ValueError, 'cannot specify both'): + data.reindex({'x': 0}, x=0) + with self.assertRaisesRegexp(ValueError, 'dictionary'): + data.reindex('foo') + + # invalid dimension + with self.assertRaisesRegexp(ValueError, 'invalid reindex dim'): + data.reindex(invalid=0) + + # 
out of order + expected = data.sel(dim2=data['dim2'][:5:-1]) + actual = data.reindex(dim2=data['dim2'][:5:-1]) + self.assertDatasetIdentical(actual, expected) + + # regression test for #279 + expected = Dataset({'x': ('time', np.random.randn(5))}, + {'time': range(5)}) + time2 = DataArray(np.arange(5), dims="time2") + actual = expected.reindex(time=time2) + self.assertDatasetIdentical(actual, expected) + + # another regression test + ds = Dataset({'foo': (['x', 'y'], np.zeros((3, 4)))}, + {'x': range(3), 'y': range(4)}) + expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 2)))}, + {'x': [0, 1, 3], 'y': [0, 1]}) + expected['foo'][-1] = np.nan + actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) + self.assertDatasetIdentical(expected, actual) + + def _test_sel_equivalent_to_reindex_variables_copied(self): + data = create_test_data() + reindexed_data = data.reindex(copy=False) + for k in data.variables: + assert reindexed_data.variables[k] is not data.variables[k] + + def _test_sel_equivalent_to_reindex_method(self): + ds = Dataset({'x': ('y', [10, 20]), 'y': [0, 1]}) + y = [-0.5, 0.5, 1.5] + actual = ds.reindex(y=y, method='backfill') + expected = Dataset({'x': ('y', [10, 20, np.nan]), 'y': y}) + self.assertDatasetIdentical(expected, actual) + + if pd.__version__ >= '0.17': + actual = ds.reindex(y=y, method='backfill', tolerance=0.1) + expected = Dataset({'x': ('y', 3 * [np.nan]), 'y': y}) + self.assertDatasetIdentical(expected, actual) + else: + with self.assertRaisesRegexp(TypeError, 'tolerance'): + ds.reindex(y=y, method='backfill', tolerance=0.1) + + actual = ds.reindex(y=y, method='pad') + expected = Dataset({'x': ('y', [np.nan, 10, 20]), 'y': y}) + self.assertDatasetIdentical(expected, actual) + + alt = Dataset({'y': y}) + actual = ds.reindex_like(alt, method='pad') + self.assertDatasetIdentical(expected, actual) + def test_sel_drop(self): data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) expected = Dataset({'foo': 1}) @@ -1290,9 +1399,6 @@ def 
test_sel_fancy(self): c=idx_3['c']) self.assertDatasetIdentical(expected, actual) - # Multi Dimensional indexers - #data.sel(x=[]) - # test from sel_points data = Dataset({'foo': (('x', 'y'), np.arange(9).reshape(3, 3))}) data.coords.update({'x': [0, 1, 2], 'y': [0, 1, 2]}) @@ -1307,7 +1413,8 @@ def test_sel_fancy(self): expected.coords.update({'x': ('points', [0, 1, 2]), 'y': ('points', [0, 1, 2])}) actual = data.sel(x=Variable(('points', ), [0.1, 1.1, 2.5]), - y=Variable(('points', ), [0, 1.2, 2.0]), method='pad') + y=Variable(('points', ), [0, 1.2, 2.0]), + method='pad') self.assertDatasetIdentical(expected, actual) idx_x = DataArray([0, 1, 2], dims=['a'], coords={'a': ['a', 'b', 'c']}) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d4b9388f8db..914a33ad1d0 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -240,7 +240,8 @@ def test_pandas_data(self): self.assertEqual(v[0].values, v.values[0]) def test_pandas_period_index(self): - v = self.cls(['x'], pd.period_range(start='2000', periods=20, freq='B')) + v = self.cls(['x'], pd.period_range(start='2000', periods=20, + freq='B')) v = v.load() # for dask-based Variable self.assertEqual(v[0], pd.Period('2000', freq='B')) assert "Period('2000-01-03', 'B')" in repr(v) @@ -643,7 +644,8 @@ def test_getitem_error(self): v = Variable(['x', 'y', 'z'], np.arange(60).reshape(3, 4, 5)) ind = Variable(['x'], [0, 1]) with self.assertRaisesRegexp(IndexError, 'Dimensions of indexers mis'): - v_new = v[:, ind] + v[:, ind] + class TestVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(Variable) @@ -1394,12 +1396,6 @@ def test_getitem_1d_fancy(self): pytest.xfail("vindex from latest dask is required") super(TestVariableWithDask, self).test_getitem_1d_fancy() - def test_getitem_fancy(self): - import dask - if LooseVersion(dask.__version__) <= LooseVersion('0.15.1'): - pytest.xfail("vindex from latest dask is required") - super(TestVariableWithDask, 
self).test_getitem_fancy() - class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) @@ -1566,10 +1562,9 @@ def test_full_like(self): self.assertEquals(expect.dtype, bool) self.assertVariableIdentical(expect, full_like(orig, True, dtype=bool)) - @requires_dask def test_full_like_dask(self): - orig = Variable(dims=('x', 'y'), data=[[1.5 ,2.0], [3.1, 4.3]], + orig = Variable(dims=('x', 'y'), data=[[1.5, 2.0], [3.1, 4.3]], attrs={'foo': 'bar'}).chunk(((1, 1), (2,))) def check(actual, expect_dtype, expect_values): From bff18f08b1548472318f4efeec1269212964f64c Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 31 Aug 2017 21:08:31 +0900 Subject: [PATCH 080/113] Remove unused tests. --- xarray/tests/test_dataset.py | 109 ----------------------------------- 1 file changed, 109 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 48f2afeffdb..88058cda5f2 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1113,115 +1113,6 @@ def test_sel_dataarray(self): self.assertDataArrayEqual(actual['new_dim'].drop('dim2'), ind['new_dim']) - def test_sel_equivalent_to_reindex_like(self): - data = create_test_data() - data['letters'] = ('dim3', 10 * ['a']) - - expected = data.isel(dim1=slice(10), time=slice(13)) - actual = data.reindex_like(expected) - self.assertDatasetIdentical(actual, expected) - actual_sel = data.sel(dim1=expected['dim1'], time=expected['time']) - self.assertDatasetIdentical(actual, actual_sel) - - expected = data.copy(deep=True) - expected['dim3'] = ('dim3', list('cdefghijkl')) - expected['var3'][:-2] = expected['var3'][2:] - expected['var3'][-2:] = np.nan - expected['letters'] = expected['letters'].astype(object) - expected['letters'][-2:] = np.nan - expected['numbers'] = expected['numbers'].astype(float) - expected['numbers'][:-2] = expected['numbers'][2:].values - expected['numbers'][-2:] = np.nan - actual = data.reindex_like(expected) - 
self.assertDatasetIdentical(actual, expected) - actual_sel = data.sel(**{d: expected[d] for d in expected.dims}) - self.assertDatasetIdentical(actual, actual_sel) - - def _test_sel_equivalent_to_reindex(self): - data = create_test_data() - self.assertDatasetIdentical(data, data.reindex()) - - expected = data.assign_coords(dim1=data['dim1']) - actual = data.reindex(dim1=data['dim1']) - self.assertDatasetIdentical(actual, expected) - - actual = data.reindex(dim1=data['dim1'].values) - self.assertDatasetIdentical(actual, expected) - - actual = data.reindex(dim1=data['dim1'].to_index()) - self.assertDatasetIdentical(actual, expected) - - with self.assertRaisesRegexp( - ValueError, 'cannot reindex or align along dimension'): - data.reindex(dim1=data['dim1'][:5]) - - expected = data.isel(dim2=slice(5)) - actual = data.reindex(dim2=data['dim2'][:5]) - self.assertDatasetIdentical(actual, expected) - - # test dict-like argument - actual = data.reindex({'dim2': data['dim2']}) - expected = data - self.assertDatasetIdentical(actual, expected) - with self.assertRaisesRegexp(ValueError, 'cannot specify both'): - data.reindex({'x': 0}, x=0) - with self.assertRaisesRegexp(ValueError, 'dictionary'): - data.reindex('foo') - - # invalid dimension - with self.assertRaisesRegexp(ValueError, 'invalid reindex dim'): - data.reindex(invalid=0) - - # out of order - expected = data.sel(dim2=data['dim2'][:5:-1]) - actual = data.reindex(dim2=data['dim2'][:5:-1]) - self.assertDatasetIdentical(actual, expected) - - # regression test for #279 - expected = Dataset({'x': ('time', np.random.randn(5))}, - {'time': range(5)}) - time2 = DataArray(np.arange(5), dims="time2") - actual = expected.reindex(time=time2) - self.assertDatasetIdentical(actual, expected) - - # another regression test - ds = Dataset({'foo': (['x', 'y'], np.zeros((3, 4)))}, - {'x': range(3), 'y': range(4)}) - expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 2)))}, - {'x': [0, 1, 3], 'y': [0, 1]}) - expected['foo'][-1] = np.nan 
- actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) - self.assertDatasetIdentical(expected, actual) - - def _test_sel_equivalent_to_reindex_variables_copied(self): - data = create_test_data() - reindexed_data = data.reindex(copy=False) - for k in data.variables: - assert reindexed_data.variables[k] is not data.variables[k] - - def _test_sel_equivalent_to_reindex_method(self): - ds = Dataset({'x': ('y', [10, 20]), 'y': [0, 1]}) - y = [-0.5, 0.5, 1.5] - actual = ds.reindex(y=y, method='backfill') - expected = Dataset({'x': ('y', [10, 20, np.nan]), 'y': y}) - self.assertDatasetIdentical(expected, actual) - - if pd.__version__ >= '0.17': - actual = ds.reindex(y=y, method='backfill', tolerance=0.1) - expected = Dataset({'x': ('y', 3 * [np.nan]), 'y': y}) - self.assertDatasetIdentical(expected, actual) - else: - with self.assertRaisesRegexp(TypeError, 'tolerance'): - ds.reindex(y=y, method='backfill', tolerance=0.1) - - actual = ds.reindex(y=y, method='pad') - expected = Dataset({'x': ('y', [np.nan, 10, 20]), 'y': y}) - self.assertDatasetIdentical(expected, actual) - - alt = Dataset({'y': y}) - actual = ds.reindex_like(alt, method='pad') - self.assertDatasetIdentical(expected, actual) - def test_sel_drop(self): data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) expected = Dataset({'foo': 1}) From 21c11c48302a7b7396ffceaae571c06e3c09ff5e Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 31 Aug 2017 22:38:31 +0900 Subject: [PATCH 081/113] Add more docs. 
--- doc/indexing.rst | 408 ++++++++++++++++++++++------------------------- 1 file changed, 194 insertions(+), 214 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 5c48d2ee9fe..9b4e7effffe 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -18,13 +18,12 @@ We provide several types (say, numpy-like and pandas-like, and more advanced) in The most basic way to access each element of xarray's multi-dimensional object is to use Python ``[obj]`` syntax, such as ``array[i, j]``, where ``i`` and ``j`` are both integers. As xarray objects can store coordinates corresponding to each dimension of the -array, label-based indexing similar to pandas object is also possible. +array, label-based indexing similar to ``pandas.DataFrame.loc`` is also possible. In label-based indexing, the element position ``i`` is automatically looked-up from the coordinate values. Dimensions of xarray object have names and you can also lookup the dimensions -ordering by name, instead of remembering the positional ordering of dimensions -by yourself. +by name, instead of remembering the positional ordering of dimensions by yourself. Thus in total, xarray supports four different kinds of indexing, as described below and summarized in this table: @@ -48,7 +47,7 @@ below and summarized in this table: +------------------+--------------+---------------------------------+--------------------------------+ -More advanced indexing is also possible for all the four types of indexings by +More advanced indexing is also possible for all the methods by supplying :py:class:`~xarray.DataArray` objects as indexer. See :ref:`advanced_indexing` for the details. @@ -84,8 +83,13 @@ fast. To do label based indexing, use the :py:attr:`~xarray.DataArray.loc` attri .. 
ipython:: python + # Coordinate 'time' arr.loc['2000-01-01':'2000-01-02', 'IA'] +In this example, the selected is a subpart of the array +in the range '2000-01-01':'2000-01-02' along the first coordinate `time` +and with 'IA' value from the second coordinate `space`. + You can perform any of the label indexing operations `supported by pandas`__, including indexing with individual, slices and arrays of labels, as well as indexing with boolean arrays. Like pandas, label based indexing in xarray is @@ -100,6 +104,13 @@ Setting values with label based indexing is also supported: arr.loc['2000-01-01', ['IL', 'IN']] = -10 arr +.. note:: + Like indexing in numpy ndarray __, + depending on whether indexing returns view or copies, setting value + fails. For the details of the value assignment, see :ref:`assigning_values`. + + __ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-to-indexed-arrays + Indexing with dimension names ----------------------------- @@ -142,146 +153,55 @@ Python :py:func:`slice` objects or 1-dimensional arrays. __ http://legacy.python.org/dev/peps/pep-0472/ -.. _advanced_indexing: - -Indexing multi-dimensional array ---------------------------------- - -As similar to numpy's nd-array, xarray supports two types of indexing, -`basic- and advanced-indexing`__. -However, our indexing rule differs from numpy's nd-array. - -__ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html - - -Our indexing is basically orthogonal, i.e. -if you pass multiple integer sequences to an array, they work independently -along each dimension (similar to the way vector subscripts work in fortran). - -.. ipython:: python - - da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], - coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) - da - da[[0, 1], [1, 1]] - # Sequential indexing gives the same result. 
- da[[0, 1], [1, 1]] == da[[0, 1]][:, [1, 1]] - - -In order to make more advanced indexing, you can supply -:py:meth:`~xarray.DataArray` as indexers. -If :py:meth:`~xarray.DataArray` is provided as indexers, the dimension of the -resultant array is determined by the indexers' dimension names, - -.. ipython:: python - - ind_x = xr.DataArray([0, 1], dims=['x']) - ind_y = xr.DataArray([0, 1], dims=['y']) - da[ind_x, ind_y] # orthogonal indexing - da[ind_x, ind_x] # vectorized indexing - -If you just provide slices or sequences, which do not have named-dimensions, -they will be understood as the same dimension which is indexed along. - -.. ipython:: python - - # Because [0, 1] is used to index along dimension 'x', - # it is assumed to have dimension 'x' - da[[0, 1], ind_x] - - -Furthermore, you can use multi-dimensional :py:meth:`~xarray.DataArray` -as indexers, - -.. ipython:: python - - ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) - da[ind] - -To summarize, our indexing rule is based on our :ref:`compute.broadcasting` -scheme. - +Nearest neighbor lookups +------------------------ -These advanced indexing also works with ``isel``, ``loc``, and ``sel``. +The label based selection methods :py:meth:`~xarray.Dataset.sel`, +:py:meth:`~xarray.Dataset.reindex` and :py:meth:`~xarray.Dataset.reindex_like` all +support ``method`` and ``tolerance`` keyword argument. The method parameter allows for +enabling nearest neighbor (inexact) lookups by use of the methods ``'pad'``, +``'backfill'`` or ``'nearest'``: .. 
ipython:: python - ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) - da.isel(y=ind) # same to da[:, ind] - - ind = xr.DataArray([['a', 'b'], ['b', 'a']], dims=['a', 'b']) - da.loc[:, ind] # same to da.sel(y=ind) + data = xr.DataArray([1, 2, 3], [('x', [0, 1, 2])]) + data.sel(x=[1.1, 1.9], method='nearest') + data.sel(x=0.1, method='backfill') + data.reindex(x=[0.5, 1, 1.5, 2, 2.5], method='pad') - -Assigning values ----------------- - -As similar to ``numpy's nd-array``, the value assignment behaves differently -depending on whether `basic- or advanced-indexing`__. - -__ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-to-indexed-arrays - -1. Basic indexing - Indexer consists of slice, ellipse, or integer. Not a sequences of integer. - By basic indexing, you can select a subset of an array to assign values. +Tolerance limits the maximum distance for valid matches with an inexact lookup: .. ipython:: python - da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], - coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) - da - da[0, 0] = -1 # Assign -1 to one element - da - - da[0] = -2 # The shape is different but broadcastable - da - - da.loc[:, 'a'] = -3 # assignment through label-based indexing is also possible - da - - -.. warning:: + data.reindex(x=[1.1, 1.5], method='nearest', tolerance=0.2) - Do not try to assign values when using any of the indexing methods ``isel`` - or ``sel``:: - - # DO NOT do this - arr.isel(space=0) = 0 +Using ``method='nearest'`` or a scalar argument with ``.sel()`` requires pandas +version 0.16 or newer. Using ``tolerance`` requries pandas version 0.17 or newer. -2. Advanced indexing - If the underlying indexing is advanced, indexing returns a copy of the - array not a view. - In this case, the method will fail, and when it fails, **it will fail - silently**. +The method parameter is not yet supported if any of the arguments +to ``.sel()`` is a ``slice`` object: -.. _pointwise indexing: +.. 
ipython:: + :verbatim: -Pointwise indexing ------------------- + In [1]: data.sel(x=slice(1, 3), method='nearest') + NotImplementedError -xarray pointwise indexing supports the indexing along multiple labeled dimensions -using list-like objects. While :py:meth:`~xarray.DataArray.isel` performs -orthogonal indexing, the :py:meth:`~xarray.DataArray.isel_points` method -provides similar numpy indexing behavior as if you were using multiple -lists to index an array (e.g. ``arr[[0, 1], [0, 1]]`` ): +However, you don't need to use ``method`` to do inexact slicing. Slicing +already returns all values inside the range (inclusive), as long as the index +labels are monotonic increasing: .. ipython:: python - # index by integer array indices - da = xr.DataArray(np.arange(56).reshape((7, 8)), dims=['x', 'y']) - da - da.isel_points(x=[0, 1, 6], y=[0, 1, 0]) + data.sel(x=slice(0.9, 3.1)) -There is also :py:meth:`~xarray.DataArray.sel_points`, which analogously -allows you to do point-wise indexing by label: +Indexing axes with monotonic decreasing labels also works, as long as the +``slice`` or ``.loc`` arguments are also decreasing: .. ipython:: python - times = pd.to_datetime(['2000-01-03', '2000-01-02', '2000-01-01']) - arr.sel_points(space=['IA', 'IL', 'IN'], time=times) - -The equivalent pandas method to ``sel_points`` is -:py:meth:`~pandas.DataFrame.lookup`. + reversed_data = data[::-1] + reversed_data.loc[3.1:0.9] Dataset indexing ---------------- @@ -294,8 +214,6 @@ simultaneously, returning a new dataset: ds = arr.to_dataset(name='foo') ds.isel(space=[0], time=[0]) ds.sel(time='2000-01-01') - ds2 = da.to_dataset(name='bar') - ds2.isel_points(x=[0, 1, 6], y=[0, 1, 0], dim='points') Positional indexing on a dataset is not supported because the ordering of dimensions in a dataset is somewhat ambiguous (it can vary between different @@ -324,55 +242,6 @@ index labels along a dimension dropped: .. 
_nearest neighbor lookups: -Nearest neighbor lookups ------------------------- - -The label based selection methods :py:meth:`~xarray.Dataset.sel`, -:py:meth:`~xarray.Dataset.reindex` and :py:meth:`~xarray.Dataset.reindex_like` all -support ``method`` and ``tolerance`` keyword argument. The method parameter allows for -enabling nearest neighbor (inexact) lookups by use of the methods ``'pad'``, -``'backfill'`` or ``'nearest'``: - -.. ipython:: python - - data = xr.DataArray([1, 2, 3], [('x', [0, 1, 2])]) - data.sel(x=[1.1, 1.9], method='nearest') - data.sel(x=0.1, method='backfill') - data.reindex(x=[0.5, 1, 1.5, 2, 2.5], method='pad') - -Tolerance limits the maximum distance for valid matches with an inexact lookup: - -.. ipython:: python - - data.reindex(x=[1.1, 1.5], method='nearest', tolerance=0.2) - -Using ``method='nearest'`` or a scalar argument with ``.sel()`` requires pandas -version 0.16 or newer. Using ``tolerance`` requries pandas version 0.17 or newer. - -The method parameter is not yet supported if any of the arguments -to ``.sel()`` is a ``slice`` object: - -.. ipython:: - :verbatim: - - In [1]: data.sel(x=slice(1, 3), method='nearest') - NotImplementedError - -However, you don't need to use ``method`` to do inexact slicing. Slicing -already returns all values inside the range (inclusive), as long as the index -labels are monotonic increasing: - -.. ipython:: python - - data.sel(x=slice(0.9, 3.1)) - -Indexing axes with monotonic decreasing labels also works, as long as the -``slice`` or ``.loc`` arguments are also decreasing: - -.. ipython:: python - - reversed_data = data[::-1] - reversed_data.loc[3.1:0.9] .. _masking with where: @@ -467,66 +336,125 @@ labels of the 1st and 2nd dimension, respectively. You must specify all dimensions or use the ellipsis in the ``loc`` specifier, e.g. in the example above, ``mda.loc[{'one': 'a', 'two': 0}, :]`` or ``mda.loc[('a', 0), ...]``. 
-Multi-dimensional indexing --------------------------- -xarray does not yet support efficient routines for generalized multi-dimensional -indexing or regridding. However, we are definitely interested in adding support -for this in the future (see :issue:`475` for the ongoing discussion). +.. _advanced_indexing: -.. _copies vs views: +Basic and Advanced Indexing +--------------------------- -Copies vs. views ----------------- +As similar to numpy's nd-array, xarray supports two types of indexing, +`basic- and advanced-indexing`__. +However, our indexing rule differs from numpy's nd-array. -Whether array indexing returns a view or a copy of the underlying -data depends on the nature of the labels. For positional (integer) -indexing, xarray follows the same rules as NumPy: +.. __ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html -* Positional indexing with only integers and slices returns a view. -* Positional indexing with arrays or lists returns a copy. -The rules for label based indexing are more complex: +Our indexing is basically orthogonal, i.e. +if you pass multiple integer sequences to an array, they work independently +along each dimension (similar to the way vector subscripts work in fortran). -* Label-based indexing with only slices returns a view. -* Label-based indexing with arrays returns a copy. -* Label-based indexing with scalars returns a view or a copy, depending - upon if the corresponding positional indexer can be represented as an - integer or a slice object. The exact rules are determined by pandas. +.. ipython:: python -Whether data is a copy or a view is more predictable in xarray than in pandas, so -unlike pandas, xarray does not produce `SettingWithCopy warnings`_. However, you -should still avoid assignment with chained indexing. + da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], + coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + da + da[[0, 1], [1, 1]] + # Sequential indexing gives the same result. 
+ da[[0, 1], [1, 1]] == da[[0, 1]][:, [1, 1]] -.. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy +In order to make more advanced indexing, you can supply +:py:meth:`~xarray.DataArray` as indexers. +In this case, the dimension of the resultant array is determined +by the indexers' dimension names, + +.. ipython:: python + + ind_x = xr.DataArray([0, 1], dims=['x']) + ind_y = xr.DataArray([0, 1], dims=['y']) + da[ind_x, ind_y] # orthogonal indexing + da[ind_x, ind_x] # vectorized indexing + +If you just provide slices or sequences, which do not have named-dimensions, +they will be understood as the same dimension which is indexed along. + +.. ipython:: python -Orthogonal (outer) vs. vectorized indexing ------------------------------------------- + # Because [0, 1] is used to index along dimension 'x', + # [0, 1] is assumed to have dimension 'x' + da[[0, 1], ind_x] -Indexing with xarray objects has one important difference from indexing numpy -arrays: you can only use one-dimensional arrays to index xarray objects, and -each indexer is applied "orthogonally" along independent axes, instead of -using numpy's broadcasting rules to vectorize indexers. This means you can do -indexing like this, which would require slightly more awkward syntax with -numpy arrays: + +Furthermore, you can use multi-dimensional :py:meth:`~xarray.DataArray` +as indexers, where the resultant array dimension is also determined by +indexers' dimension, .. ipython:: python - arr[arr['time.day'] > 1, arr['space'] != 'IL'] + ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) + da[ind] -This is a much simpler model than numpy's `advanced indexing`__. If you would -like to do advanced-style array indexing in xarray, you have several options: +To summarize, our indexing rule is based on our broadcasting scheme. +See :ref:`compute.broadcasting` for the detail. 
-* :ref:`pointwise indexing` -* :ref:`masking with where` -* Index the underlying NumPy array directly using ``.values``, e.g., -__ http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html +These advanced indexing also works with ``isel``, ``loc``, and ``sel``. .. ipython:: python - arr.values[arr.values > 0.5] + ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) + da.isel(y=ind) # same to da[:, ind] + + ind = xr.DataArray([['a', 'b'], ['b', 'a']], dims=['a', 'b']) + da.loc[:, ind] # same to da.sel(y=ind) + + +and also for Dataset + +.. ipython:: python + + ds2 = da.to_dataset(name='bar') + ds2.isel(x=xr.DataArray([0, 1, 2], dims=['points']), + y=xr.DataArray([0, 1, 0], dims=['points'])) + + +More advanced indexing +----------------------- + +The use of :py:meth:`~xarray.DataArray` as indexers enables very flexible indexing. +The following is an example of the pointwise indexing, + +.. ipython:: python + + # index by integer array indices + da = xr.DataArray(np.arange(56).reshape((7, 8)), dims=['x', 'y']) + da + da.isel(x=xr.DataArray([0, 1, 6], dims='z'), + y=xr.DataArray([0, 1, 0], dims='z')) + +where three elements at ``(ix, iy) = ((0, 0), (1, 1), (6, 0))`` are selected +and mapped along a new dimension ``z``. + +If you want to add a coordinate to the dimension ``z``, +you can supply a :py:meth:`~xarray.DataArray` with a coordinate as indexers, + +.. ipython:: python + + # z will have a coordinate + da.isel(x=xr.DataArray([0, 1, 6], dims='z', + coords={'z': ['a', 'b', 'c']}), + y=xr.DataArray([0, 1, 0], dims='z')) + + +Analogously, label-based pointwise-indexing is also possible by ``.sel`` method, + +.. ipython:: python + + times = xr.DataArray(pd.to_datetime(['2000-01-03', '2000-01-02', '2000-01-01']), + dims='new_time') + arr.sel(space=xr.DataArray(['IA', 'IL', 'IN'], dims=['new_time']), + time=times) + .. _align and reindex: @@ -636,3 +564,55 @@ labels: array array.get_index('x') + + +.. 
_assigning_values: + +Assigning values +---------------- + +Whether array indexing returns a view or a copy of the underlying +data depends on the nature of the labels. +When it returns a view, the value assignment is possible. +However if it returns a copy, the value assignment can fail, +and if it fails it fails *silently*. + +For positional (integer) +indexing, xarray follows the same rules as NumPy: + +* Positional indexing with only integers and slices returns a view. +* Positional indexing with arrays or lists returns a copy. + + +.. ipython:: python + + da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], + coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + da + da[0, 0] = -1 # Assign -1 to one element + da + + da[0] = -2 # The shape is different but broadcastable + da + +The rules for label based indexing are more complex: + +* Label-based indexing with only slices returns a view. +* Label-based indexing with arrays returns a copy. +* Label-based indexing with scalars returns a view or a copy, depending + upon if the corresponding positional indexer can be represented as an + integer or a slice object. The exact rules are determined by pandas. + +Whether data is a copy or a view is more predictable in xarray than in pandas, so +unlike pandas, xarray does not produce `SettingWithCopy warnings`_. However, you +should still avoid assignment with chained indexing. + +.. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy + +.. 
warning:: + + Do not try to assign values when using any of the indexing methods ``isel`` + or ``sel``:: + + # DO NOT do this + arr.isel(space=0) = 0 From 1975f66ae0c1e56d458a1709fedf27922d9ec902 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 31 Aug 2017 23:08:15 +0900 Subject: [PATCH 082/113] Api.rst changed --- doc/api.rst | 4 ---- doc/indexing.rst | 39 +++++++++++++++++++-------------------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 433aa93c9de..50682ac3a0e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -105,8 +105,6 @@ Indexing Dataset.loc Dataset.isel Dataset.sel - Dataset.isel_points - Dataset.sel_points Dataset.squeeze Dataset.reindex Dataset.reindex_like @@ -247,8 +245,6 @@ Indexing DataArray.loc DataArray.isel DataArray.sel - DataArray.isel_points - DataArray.sel_points DataArray.squeeze DataArray.reindex DataArray.reindex_like diff --git a/doc/indexing.rst b/doc/indexing.rst index 9b4e7effffe..45cb91dd4f5 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -13,7 +13,7 @@ Indexing and selecting data The point of xarray is to introduce a numpy-ndarray-like multidimensional array object into a powerful pandas's flexible data handling scheme. -We provide several types (say, numpy-like and pandas-like, and more advanced) indexing functionalities. +We provide several (say, numpy-like, pandas-like, and more advanced type) indexing functionalities. The most basic way to access each element of xarray's multi-dimensional object is to use Python ``[obj]`` syntax, such as ``array[i, j]``, where ``i`` and ``j`` are both integers. @@ -83,7 +83,6 @@ fast. To do label based indexing, use the :py:attr:`~xarray.DataArray.loc` attri .. ipython:: python - # Coordinate 'time' arr.loc['2000-01-01':'2000-01-02', 'IA'] In this example, the selected is a subpart of the array @@ -105,9 +104,9 @@ Setting values with label based indexing is also supported: arr .. 
note:: - Like indexing in numpy ndarray __, - depending on whether indexing returns view or copies, setting value - fails. For the details of the value assignment, see :ref:`assigning_values`. + Like indexing in numpy `ndarray`__, + setting values could fail depending on whether indexing returns views or copies. + For the details of the value assignment, see :ref:`assigning_values`. __ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-to-indexed-arrays @@ -152,6 +151,7 @@ Python :py:func:`slice` objects or 1-dimensional arrays. __ http://legacy.python.org/dev/peps/pep-0472/ +.. _nearest neighbor lookups: Nearest neighbor lookups ------------------------ @@ -175,9 +175,6 @@ Tolerance limits the maximum distance for valid matches with an inexact lookup: data.reindex(x=[1.1, 1.5], method='nearest', tolerance=0.2) -Using ``method='nearest'`` or a scalar argument with ``.sel()`` requires pandas -version 0.16 or newer. Using ``tolerance`` requries pandas version 0.17 or newer. - The method parameter is not yet supported if any of the arguments to ``.sel()`` is a ``slice`` object: @@ -217,7 +214,7 @@ simultaneously, returning a new dataset: Positional indexing on a dataset is not supported because the ordering of dimensions in a dataset is somewhat ambiguous (it can vary between different -arrays). However, you can do normal indexing with labeled dimensions: +arrays). However, you can do normal indexing with dimension names: .. ipython:: python @@ -240,8 +237,6 @@ index labels along a dimension dropped: ``drop`` is both a ``Dataset`` and ``DataArray`` method. -.. _nearest neighbor lookups: - .. _masking with where: @@ -344,9 +339,9 @@ Basic and Advanced Indexing As similar to numpy's nd-array, xarray supports two types of indexing, `basic- and advanced-indexing`__. -However, our indexing rule differs from numpy's nd-array. +However, our indexing rule differs from numpy. -.. 
__ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html +__ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html Our indexing is basically orthogonal, i.e. @@ -381,7 +376,7 @@ they will be understood as the same dimension which is indexed along. .. ipython:: python # Because [0, 1] is used to index along dimension 'x', - # [0, 1] is assumed to have dimension 'x' + # it is assumed to have dimension 'x' da[[0, 1], ind_x] @@ -394,7 +389,7 @@ indexers' dimension, ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) da[ind] -To summarize, our indexing rule is based on our broadcasting scheme. +To summarize, our advanced indexing is based on our broadcasting scheme. See :ref:`compute.broadcasting` for the detail. @@ -414,8 +409,13 @@ and also for Dataset .. ipython:: python ds2 = da.to_dataset(name='bar') - ds2.isel(x=xr.DataArray([0, 1, 2], dims=['points']), - y=xr.DataArray([0, 1, 0], dims=['points'])) + ds2.isel(x=xr.DataArray([0, 1, 2], dims=['points'])) + +.. note:: + This advanced indexing was newly added in v.0.10. + In the older version of xarray, dimensions of indexers are not used. + Special methods to realize some advanced indexing, + ``isel_points`` and ``sel_points`` are now deprecated. More advanced indexing @@ -435,12 +435,11 @@ The following is an example of the pointwise indexing, where three elements at ``(ix, iy) = ((0, 0), (1, 1), (6, 0))`` are selected and mapped along a new dimension ``z``. -If you want to add a coordinate to the dimension ``z``, -you can supply a :py:meth:`~xarray.DataArray` with a coordinate as indexers, +If you want to add a coordinate to the new dimension ``z``, +you can supply a :py:meth:`~xarray.DataArray` with a coordinate, .. 
ipython:: python - # z will have a coordinate da.isel(x=xr.DataArray([0, 1, 6], dims='z', coords={'z': ['a', 'b', 'c']}), y=xr.DataArray([0, 1, 0], dims='z')) From b49f813e49819c3cb7ae62da7579127945cc81b3 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Fri, 1 Sep 2017 00:38:56 +0900 Subject: [PATCH 083/113] Add link in whats-new --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 11829754920..75534a70599 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,7 +27,7 @@ Backward Incompatible Changes indexing, as well as vectorized indexing. Due to this change, existing uses of xarray objects to index other xarray objects will break in some cases. - See *** for the details. + See :ref:`indexing` for the details. (:issue:`1444`, :issue:`1436`, ) By `Keisuke Fujii <https://github.com/fujiisoup>`_ and `Stephan Hoyer <https://github.com/shoyer>`_. From 1fd6b3a3859dd827a75422866a75b93ce66cb1de Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Fri, 1 Sep 2017 00:40:55 +0900 Subject: [PATCH 084/113] Small format cleanup --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 75534a70599..bb87c412711 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,13 +22,13 @@ Backward Incompatible Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - xarray now supports vectorized indexing, where we consider the dimension of - indexer, e.g. `array.sel(x=ind)` with `ind.dims == ('y', )` . + indexer, e.g. ``array.sel(x=ind)`` with ``ind.dims == ('y', )`` . This enables us more advanced indexing, including outer indexing, diagonal indexing, as well as vectorized indexing. Due to this change, existing uses of xarray objects to index other xarray objects will break in some cases. - See :ref:`indexing` for the details. - (:issue:`1444`, :issue:`1436`, ) + See :ref:`indexing` for the details + (:issue:`1444`, :issue:`1436`, ). By `Keisuke Fujii <https://github.com/fujiisoup>`_ and `Stephan Hoyer <https://github.com/shoyer>`_. 
From 46dd7c7cd0a9980572bcfd8f8362d5fb32c7444f Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 31 Aug 2017 16:41:36 -0700 Subject: [PATCH 085/113] allow positional indexing with unsigned integer types --- doc/whats-new.rst | 4 ++++ xarray/core/indexing.py | 2 +- xarray/tests/test_indexing.py | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d74ebc05391..39cb078d559 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -98,6 +98,10 @@ Bug fixes objects with data stored as ``dask`` arrays (:issue:`1529`). By `Joe Hamman `_. +- Fix positional indexing to allow the use of unsigned integers (:issue:`1405`). + By `Joe Hamman `_ and + `Gerrit Holl Date: Fri, 1 Sep 2017 21:34:35 +0900 Subject: [PATCH 086/113] Catch up to the previous merge. --- xarray/core/dataset.py | 4 +++- xarray/core/nputils.py | 7 +++---- xarray/core/variable.py | 5 ++--- xarray/tests/test_indexing.py | 3 +-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4c351a2823d..c82dd420a83 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1289,7 +1289,9 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): ) # attach indexer's coordinate to pos_indexers for k, v in new_coords.items(): - pos_indexers[k] = DataArray(pos_indexers[k], coords=v) + if isinstance(pos_indexers[k], Variable): + pos_indexers[k] = DataArray(pos_indexers[k], coords=v, + dims=pos_indexers[k].dims) result = self.isel(drop=drop, **pos_indexers) return result._replace_indexes(new_indexes) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 35fc79285e6..873478875f5 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -5,8 +5,6 @@ import pandas as pd import warnings -from .npcompat import moveaxis - def _validate_axis(data, axis): ndim = data.ndim @@ -130,9 +128,10 @@ def __init__(self, array): def __getitem__(self, key): mixed_positions, 
vindex_positions = _advanced_indexer_subspaces(key) - return moveaxis(self._array[key], mixed_positions, vindex_positions) + return np.moveaxis(self._array[key], mixed_positions, vindex_positions) def __setitem__(self, key, value): """Value must have dimensionality matching the key.""" mixed_positions, vindex_positions = _advanced_indexer_subspaces(key) - self._array[key] = moveaxis(value, vindex_positions, mixed_positions) + self._array[key] = np.moveaxis(value, vindex_positions, + mixed_positions) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 172dddf662f..437384dcdb7 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -17,7 +17,6 @@ from . import nputils from . import ops from . import utils -from .npcompat import moveaxis from .pycompat import (basestring, OrderedDict, zip, integer_types, dask_array_type) from .indexing import (PandasIndexAdapter, xarray_indexable, BasicIndexer, @@ -562,7 +561,7 @@ def __getitem__(self, key): dims, index_tuple, new_order = self._broadcast_indexes(key) data = self._indexable_data[index_tuple] if new_order: - data = moveaxis(data, range(len(new_order)), new_order) + data = np.moveaxis(data, range(len(new_order)), new_order) assert getattr(data, 'ndim', 0) == len(dims), (data.ndim, len(dims)) return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) @@ -588,7 +587,7 @@ def __setitem__(self, key, value): value = value[(len(dims) - value.ndim) * (np.newaxis,) + (Ellipsis,)] - value = moveaxis(value, new_order, range(len(new_order))) + value = np.moveaxis(value, new_order, range(len(new_order))) self._indexable_data[index_tuple] = value diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 34a0060309d..8e4b88011bc 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -9,7 +9,6 @@ from xarray import Dataset, DataArray, Variable from xarray.core import indexing from xarray.core import nputils -from xarray.core.npcompat import 
moveaxis from . import TestCase, ReturnItem @@ -248,7 +247,7 @@ def nonzero(x): expected_data = nputils.NumpyVIndexAdapter(v.data)[expected] if new_order: old_order = range(len(new_order)) - expected_data = moveaxis(expected_data, old_order, new_order) + expected_data = np.moveaxis(expected_data, old_order, new_order) outer_index = indexing.OuterIndexer( (nonzero(i), nonzero(j), nonzero(k))) From 71049648c4ef6da6652c0e3fb4bbf38fb6abdd27 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Fri, 1 Sep 2017 22:28:55 +0900 Subject: [PATCH 087/113] workaround for daskarray with uint indexer. --- xarray/core/indexing.py | 13 +++++++++---- xarray/tests/test_dataarray.py | 13 +++++++++++++ xarray/tests/test_indexing.py | 5 ----- xarray/tests/test_variable.py | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 3031c67e545..80a134928d0 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -457,7 +457,7 @@ def _outer_to_numpy_indexer(key, shape): else: # np.ndarray or slice if isinstance(k, slice): k = np.arange(*k.indices(size)) - assert k.dtype.kind == 'i' + assert k.dtype.kind in {'i', 'u'} shape = [(1,) * i_dim + (k.size, ) + (1,) * (n_dim - i_dim - 1)] new_key.append(k.reshape(*shape)) @@ -510,13 +510,18 @@ def __init__(self, array): self.array = array def __getitem__(self, key): + def to_int_tuple(key): + # workaround for uint64 indexer (GH:1406) + return tuple([k.astype(int) if isinstance(k, np.ndarray) + else k for k in key]) + if isinstance(key, BasicIndexer): - return self.array[tuple(key)] + return self.array[to_int_tuple(key)] elif isinstance(key, VectorizedIndexer): - return self.array.vindex[tuple(key)] + return self.array.vindex[to_int_tuple(tuple(key))] else: assert isinstance(key, OuterIndexer) - key = tuple(key) + key = to_int_tuple(tuple(key)) try: return self.array[key] except NotImplementedError: diff --git a/xarray/tests/test_dataarray.py 
b/xarray/tests/test_dataarray.py index f4ec797166b..9a06266447d 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -552,6 +552,19 @@ def test_isel(self): self.assertDataArrayIdentical(self.dv[:3, :5], self.dv.isel(x=slice(3), y=slice(5))) + def test_isel_types(self): + # regression test for #1405 + da = DataArray([1, 2, 3], dims='x') + # uint64 + self.assertDataArrayIdentical(da.isel(x=np.array([0], dtype="uint64")), + da.isel(x=np.array([0]))) + # uint32 + self.assertDataArrayIdentical(da.isel(x=np.array([0], dtype="uint32")), + da.isel(x=np.array([0]))) + # int64 + self.assertDataArrayIdentical(da.isel(x=np.array([0], dtype="int64")), + da.isel(x=np.array([0]))) + def test_isel_fancy(self): shape = (10, 7, 6) np_array = np.random.random(shape) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index df9f846fc50..8e4b88011bc 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -126,11 +126,6 @@ def test_indexer(data, x, expected_pos, expected_idx=None): [True, True, True, True, False, False, False, False], pd.MultiIndex.from_product([[1, 2], [-1, -2]])) - def test_uint_indexer(self): - # regression test for #1405 - da = DataArray([1, 2, 3], dims='x') - da.isel(x=np.array([0], dtype="uint64")) - class TestLazyArray(TestCase): def test_slice_slice(self): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index c3b101db1e2..142b1608fed 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -539,6 +539,35 @@ def test_getitem_advanced(self): expected = v[dict(y=2)] self.assertArrayEqual(v_new, expected) + def test_getitem_uint_1d(self): + # regression test for #1405 + v = self.cls(['x'], [0, 1, 2]) + v_data = v.compute().data + + v_new = v[np.array([0])] + self.assertArrayEqual(v_new, v_data[0]) + v_new = v[np.array([0], dtype="uint64")] + self.assertArrayEqual(v_new, v_data[0]) + + def test_getitem_uint(self): + # regression test for 
#1405 + v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) + v_data = v.compute().data + + v_new = v[np.array([0])] + self.assertArrayEqual(v_new, v_data[[0], :]) + v_new = v[np.array([0], dtype="uint64")] + self.assertArrayEqual(v_new, v_data[[0], :]) + + def test_getitem_0d_array(self): + # make sure 0d-np.array can be used as an indexer + v = self.cls(['x'], [0, 1, 2]) + v_data = v.compute().data + + ind = np.array(0, dtype='int') # This is 0d-array + v_new = v[np.array([0])[0]] + self.assertArrayEqual(v_new, v_data[0]) + def test_getitem_fancy(self): v = self.cls(['x', 'y'], [[0, 1, 2], [3, 4, 5]]) v_data = v.compute().data @@ -1497,6 +1526,10 @@ def test_getitem_advanced(self): def test_getitem_fancy(self): super(TestIndexVariable, self).test_getitem_fancy() + @pytest.mark.xfail + def test_getitem_uint(self): + super(TestIndexVariable, self).test_getitem_fancy() + class TestAsCompatibleData(TestCase): def test_unchanged_types(self): From 173968bf787c0d802d6f8c2fb9acb945d215ca38 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 2 Sep 2017 22:51:04 +0900 Subject: [PATCH 088/113] Add a section about assignment, full indexing rules. --- doc/indexing.rst | 278 ++++++++++++++++++++-------------- doc/whats-new.rst | 4 +- xarray/core/indexing.py | 1 + xarray/tests/test_variable.py | 15 +- 4 files changed, 185 insertions(+), 113 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 45cb91dd4f5..471133814f4 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -46,11 +46,9 @@ below and summarized in this table: | | | ``arr.loc[dict(space='IA')]`` | ``ds.loc[dict(space='IA')]`` | +------------------+--------------+---------------------------------+--------------------------------+ - More advanced indexing is also possible for all the methods by supplying :py:class:`~xarray.DataArray` objects as indexer. -See :ref:`advanced_indexing` for the details. - +See :ref:`vectorized_indexing` for the details. 
Positional indexing @@ -75,7 +73,7 @@ Attributes are persisted in all indexing operations. Positional indexing deviates from the NumPy when indexing with multiple arrays like ``arr[[0, 1], [0, 1]]``, as described in - :ref:`advanced_indexing`. + :ref:`vectorized_indexing`. xarray also supports label-based indexing, just like pandas. Because we use a :py:class:`pandas.Index` under the hood, label based indexing is very @@ -103,13 +101,6 @@ Setting values with label based indexing is also supported: arr.loc['2000-01-01', ['IL', 'IN']] = -10 arr -.. note:: - Like indexing in numpy `ndarray`__, - setting values could fail depending on whether indexing returns views or copies. - For the details of the value assignment, see :ref:`assigning_values`. - - __ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-to-indexed-arrays - Indexing with dimension names ----------------------------- @@ -151,6 +142,7 @@ Python :py:func:`slice` objects or 1-dimensional arrays. __ http://legacy.python.org/dev/peps/pep-0472/ + .. _nearest neighbor lookups: Nearest neighbor lookups @@ -200,6 +192,7 @@ Indexing axes with monotonic decreasing labels also works, as long as the reversed_data = data[::-1] reversed_data.loc[3.1:0.9] + Dataset indexing ---------------- @@ -272,79 +265,17 @@ elements that are fully masked: arr2.where(arr2.y < 2, drop=True) -.. _multi-level indexing: - -Multi-level indexing --------------------- - -Just like pandas, advanced indexing on multi-level indexes is possible with -``loc`` and ``sel``. You can slice a multi-index by providing multiple indexers, -i.e., a tuple of slices, labels, list of labels, or any selector allowed by -pandas: - -.. 
ipython:: python - - midx = pd.MultiIndex.from_product([list('abc'), [0, 1]], - names=('one', 'two')) - mda = xr.DataArray(np.random.rand(6, 3), - [('x', midx), ('y', range(3))]) - mda - mda.sel(x=(list('ab'), [0])) - -You can also select multiple elements by providing a list of labels or tuples or -a slice of tuples: - -.. ipython:: python - - mda.sel(x=[('a', 0), ('b', 1)]) - -Additionally, xarray supports dictionaries: - -.. ipython:: python - - mda.sel(x={'one': 'a', 'two': 0}) - -For convenience, ``sel`` also accepts multi-index levels directly -as keyword arguments: - -.. ipython:: python - - mda.sel(one='a', two=0) - -Note that using ``sel`` it is not possible to mix a dimension -indexer with level indexers for that dimension -(e.g., ``mda.sel(x={'one': 'a'}, two=0)`` will raise a ``ValueError``). - -Like pandas, xarray handles partial selection on multi-index (level drop). -As shown below, it also renames the dimension / coordinate when the -multi-index is reduced to a single index. - -.. ipython:: python - - mda.loc[{'one': 'a'}, ...] - -Unlike pandas, xarray does not guess whether you provide index levels or -dimensions when using ``loc`` in some ambiguous cases. For example, for -``mda.loc[{'one': 'a', 'two': 0}]`` and ``mda.loc['a', 0]`` xarray -always interprets ('one', 'two') and ('a', 0) as the names and -labels of the 1st and 2nd dimension, respectively. You must specify all -dimensions or use the ellipsis in the ``loc`` specifier, e.g. in the example -above, ``mda.loc[{'one': 'a', 'two': 0}, :]`` or ``mda.loc[('a', 0), ...]``. - - -.. _advanced_indexing: -Basic and Advanced Indexing ---------------------------- +.. _vectorized_indexing: -As similar to numpy's nd-array, xarray supports two types of indexing, -`basic- and advanced-indexing`__. -However, our indexing rule differs from numpy. 
- -__ https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html +Vectorized Indexing +------------------- +xarray supports many types of indexing in a `vectorized` manner. -Our indexing is basically orthogonal, i.e. +If you provide an integer, slice, or unlabeled array (array without dimension names, such as ``np.ndarray``, ``list``, but not :py:class:`~xarray.DataArray` or :py:class:`~xarray.Variable`), +our indexing is basically orthogonal. +For example, if you pass multiple integer sequences to an array, they work independently along each dimension (similar to the way vector subscripts work in fortran). @@ -357,7 +288,6 @@ along each dimension (similar to the way vector subscripts work in fortran). # Sequential indexing gives the same result. da[[0, 1], [1, 1]] == da[[0, 1]][:, [1, 1]] - In order to make more advanced indexing, you can supply :py:meth:`~xarray.DataArray` as indexers. In this case, the dimension of the resultant array is determined @@ -370,8 +300,9 @@ by the indexers' dimension names, da[ind_x, ind_y] # orthogonal indexing da[ind_x, ind_x] # vectorized indexing -If you just provide slices or sequences, which do not have named-dimensions, -they will be understood as the same dimension which is indexed along. +Slices or sequences, which do not have named-dimensions, +as a matter of fact, +will be understood as the same dimension which is indexed along. .. ipython:: python @@ -379,7 +310,6 @@ they will be understood as the same dimension which is indexed along. # it is assumed to have dimension 'x' da[[0, 1], ind_x] - Furthermore, you can use multi-dimensional :py:meth:`~xarray.DataArray` as indexers, where the resultant array dimension is also determined by indexers' dimension, @@ -390,10 +320,9 @@ indexers' dimension, da[ind] To summarize, our advanced indexing is based on our broadcasting scheme. -See :ref:`compute.broadcasting` for the detail. - +See :ref:`xarray_indexing_rules` for the full list of our indexing rules.
-These advanced indexing also works with ``isel``, ``loc``, and ``sel``. +Vectorized indexing also works with ``isel``, ``loc``, and ``sel``. .. ipython:: python @@ -416,7 +345,73 @@ and also for Dataset In the older version of xarray, dimensions of indexers are not used. Special methods to realize some advanced indexing, ``isel_points`` and ``sel_points`` are now deprecated. + See :ref:`more_advanced_indexing` for their alternative. + + +.. _assigning_values: + +Assigning values with indexing +------------------------------ + +Vectorized indexing can be used to assign values to an xarray object. + +.. ipython:: python + + da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], + coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + da + da[0] = -1 # assignment with broadcasting + da + + ind_x = xr.DataArray([0, 1], dims=['x']) + ind_y = xr.DataArray([0, 1], dims=['y']) + da[ind_x, ind_y] = -2 # assign -2 to (ix, iy) = (0, 0) and (1, 1) + da + + da[ind_x, ind_y] += 100 # increment is also possible + da + +Like numpy ndarray, value assignment sometimes works differently from what one may expect. + +.. ipython:: python + + da = xr.DataArray([0, 1, 2, 3], dims=['x']) + ind = xr.DataArray([0, 0, 0], dims=['x']) + da[ind] -= 1 + da + +Here, 1 is subtracted from the 0th element only once. +This is because ``v[0] = v[0] - 1`` is called three times, rather than +``v[0] = v[0] - 1 - 1 - 1``. +See `Assigning values to indexed arrays`__ for the details. +__ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-to-indexed-arrays + + +.. note:: + Dask backend does not yet support value assignment + (see :ref:`dask` for the details). + + +.. warning:: + + Do not try to assign values when using any of the indexing methods ``isel`` + or ``sel``:: + + # DO NOT do this + arr.isel(space=0) = 0 + + Assigning values with the chained indexing using ``.sel`` or ``.isel`` fails silently. + + .. 
ipython:: python + + da = xr.DataArray([0, 1, 2, 3], dims=['x']) + # DO NOT do this + da.isel(x=[0, 1, 2])[1] = -1 + da + + +.. _more_advanced_indexing: More advanced indexing ----------------------- @@ -426,7 +421,6 @@ The following is an example of the pointwise indexing, .. ipython:: python - # index by integer array indices da = xr.DataArray(np.arange(56).reshape((7, 8)), dims=['x', 'y']) da da.isel(x=xr.DataArray([0, 1, 6], dims='z'), @@ -565,16 +559,13 @@ labels: array.get_index('x') -.. _assigning_values: +.. _copies_vs_views: -Assigning values +Copies vs. Views ---------------- Whether array indexing returns a view or a copy of the underlying data depends on the nature of the labels. -When it returns a view, the value assignment is possible. -However if it returns a copy, the value assignment can fail, -and if it fails it fails *silently*. For positional (integer) indexing, xarray follows the same rules as NumPy: @@ -582,18 +573,6 @@ indexing, xarray follows the same rules as NumPy: * Positional indexing with only integers and slices returns a view. * Positional indexing with arrays or lists returns a copy. - -.. ipython:: python - - da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], - coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) - da - da[0, 0] = -1 # Assign -1 to one element - da - - da[0] = -2 # The shape is different but broadcastable - da - The rules for label based indexing are more complex: * Label-based indexing with only slices returns a view. @@ -608,10 +587,89 @@ should still avoid assignment with chained indexing. .. _SettingWithCopy warnings: http://pandas.pydata.org/pandas-docs/stable/indexing.html#returning-a-view-versus-a-copy -.. warning:: - Do not try to assign values when using any of the indexing methods ``isel`` - or ``sel``:: +.. _multi-level indexing: + +Multi-level indexing +-------------------- + +Just like pandas, advanced indexing on multi-level indexes is possible with +``loc`` and ``sel``. 
You can slice a multi-index by providing multiple indexers, +i.e., a tuple of slices, labels, list of labels, or any selector allowed by +pandas: + +.. ipython:: python + + midx = pd.MultiIndex.from_product([list('abc'), [0, 1]], + names=('one', 'two')) + mda = xr.DataArray(np.random.rand(6, 3), + [('x', midx), ('y', range(3))]) + mda + mda.sel(x=(list('ab'), [0])) + +You can also select multiple elements by providing a list of labels or tuples or +a slice of tuples: + +.. ipython:: python + + mda.sel(x=[('a', 0), ('b', 1)]) + +Additionally, xarray supports dictionaries: + +.. ipython:: python + + mda.sel(x={'one': 'a', 'two': 0}) + +For convenience, ``sel`` also accepts multi-index levels directly +as keyword arguments: + +.. ipython:: python + + mda.sel(one='a', two=0) + +Note that using ``sel`` it is not possible to mix a dimension +indexer with level indexers for that dimension +(e.g., ``mda.sel(x={'one': 'a'}, two=0)`` will raise a ``ValueError``). + +Like pandas, xarray handles partial selection on multi-index (level drop). +As shown below, it also renames the dimension / coordinate when the +multi-index is reduced to a single index. + +.. ipython:: python + + mda.loc[{'one': 'a'}, ...] + +Unlike pandas, xarray does not guess whether you provide index levels or +dimensions when using ``loc`` in some ambiguous cases. For example, for +``mda.loc[{'one': 'a', 'two': 0}]`` and ``mda.loc['a', 0]`` xarray +always interprets ('one', 'two') and ('a', 0) as the names and +labels of the 1st and 2nd dimension, respectively. You must specify all +dimensions or use the ellipsis in the ``loc`` specifier, e.g. in the example +above, ``mda.loc[{'one': 'a', 'two': 0}, :]`` or ``mda.loc[('a', 0), ...]``. + + +.. _xarray_indexing_rules: + +xarray indexing rules +--------------------- + +The detailed indexing scheme in xarray is as follows. +(Note that this is for explanatory purposes only; the actual implementation differs.) + +0. (Only for label based indexing.) 
Look up positional indexes along each dimension based on :py:class:`pandas.Index`. + +1. ``slice`` is converted to an array, i.e. ``np.arange(*slice.indices(...))``. + +2. Array indexers without dimension names, such as ``np.ndarray`` and ``list``, are assumed to have the dimension to be indexed along. For example, ``v.isel(x=[0, 1])`` is understood as ``v.isel(x=xr.DataArray([0, 1], dims=['x']))``. + +3. Broadcast all the indexers based on their dimension names (see :ref:`compute.broadcasting` for our name-based broadcasting). + +4. Index the object by the broadcasted indexers. + +5. If an indexer-DataArray has coordinates, attach them to the indexed object. + +.. note:: + + + There should not be a conflict between the coordinates of indexer- and indexed- DataArrays. In v.0.10.0, xarray raises ``FutureWarning`` if there is such a conflict, but in the next major release, it will raise an Error. - # DO NOT do this - arr.isel(space=0) = 0 + + Only 1-dimensional boolean arrays can be used as indexers. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6e066a80c0e..bdab4e1fc20 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -16,7 +16,7 @@ What's New .. _whats-new.0.9.7: v0.10.0 (unreleased) -------------------- +-------------------- Backward Incompatible Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -27,7 +27,7 @@ Backward Incompatible Changes indexing, as well as vectorized indexing. Due to this change, existing uses of xarray objects to index other xarray objects will break in some cases. - See :ref:`indexing` for the details + See :ref:`vectorized_indexing` for the details (:issue:`1444`, :issue:`1436`, ). By `Keisuke Fujii `_ and `Stephan Hoyer `_. 
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 80a134928d0..448bf759334 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -512,6 +512,7 @@ def __init__(self, array): def __getitem__(self, key): def to_int_tuple(key): # workaround for uint64 indexer (GH:1406) + # TODO remove here after next dask releas (0.15.3) return tuple([k.astype(int) if isinstance(k, np.ndarray) else k for k in key]) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 142b1608fed..5d91c0d5f05 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1397,7 +1397,20 @@ def test_setitem(self): ind = Variable(['a'], [0, 1]) v[dict(x=ind)] = Variable(['a', 'y'], np.ones((2, 3), dtype=int) * 10) self.assertArrayEqual(v[0], np.ones_like(v[0]) * 10) - self.assertArrayEqual(v[1], np.ones_like(v[0]) * 10) + self.assertArrayEqual(v[1], np.ones_like(v[1]) * 10) + assert v.dims == ('x', 'y') # dimension should not change + + # increment + v = Variable(['x', 'y'], np.arange(6).reshape(3, 2)) + ind = Variable(['a'], [0, 1]) + v[dict(x=ind)] += 1 + expected = Variable(['x', 'y'], [[1, 2], [3, 4], [4, 5]]) + self.assertVariableIdentical(v, expected) + + ind = Variable(['a'], [0, 0]) + v[dict(x=ind)] += 1 + expected = Variable(['x', 'y'], [[2, 3], [3, 4], [4, 5]]) + self.assertVariableIdentical(v, expected) @requires_dask From 7ad7d364b9d17adfd4302ceca90ee7c01908e928 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 4 Sep 2017 12:11:50 +0900 Subject: [PATCH 089/113] warning added for reindex for DataArray indexers. 
--- xarray/core/dataset.py | 11 +++++++++++ xarray/tests/test_dataset.py | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0b26fa18554..d531eefb78c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1582,6 +1582,17 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, indexers = utils.combine_pos_and_kw_args(indexers, kw_indexers, 'reindex') + from .dataarray import DataArray + import warnings + for dim, ind in indexers.items(): + if isinstance(ind, DataArray) and ind.dims != ('dims', ): + warnings.warn( + "Indexer has dimensions {0:s} that are different " + "from that to be indexed along {1:s}. " + "This will behave differently in the future.".format( + str(ind.dims), dim), + FutureWarning, stacklevel=3) + bad_dims = [d for d in indexers if d not in self.dims] if bad_dims: raise ValueError('invalid reindex dimensions: %s' % bad_dims) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 3876d430916..4c9d58003f7 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1481,6 +1481,25 @@ def test_reindex(self): actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) self.assertDatasetIdentical(expected, actual) + def test_reindex_warning(self): + import warnings + data = create_test_data() + + with pytest.warns(FutureWarning) as ws: + # DataArray with different dimension raises Future warning + ind = xr.DataArray([0.0, 1.0], dims=['new_dim'], name='ind') + data.reindex(dim2=ind) + assert any(["Indexer ind has dimensions new_dim that are" not in + str(w.message) for w in ws]) + + with pytest.warns(FutureWarning) as ws: + # Should not warn + ind = xr.DataArray([0.0, 1.0], dims=['dim2'], name='ind') + data.reindex(dim2=ind) + assert all(["Indexer ind has dimensions new_dim that are" not in + str(w.message) for w in ws]) + warnings.warn('dummy', FutureWarning, stacklevel=3) + def test_reindex_variables_copied(self): data 
= create_test_data() reindexed_data = data.reindex(copy=False) From 91dd833ad844f17b07c5e329e3f09827eca907d5 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 4 Sep 2017 19:52:01 +0900 Subject: [PATCH 090/113] Move warning in alignment.reindex_variables. --- xarray/core/alignment.py | 11 +++++++++++ xarray/core/dataset.py | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 44ad5faf5e2..35a0d93979b 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -301,6 +301,9 @@ def reindex_variables(variables, sizes, indexes, indexers, method=None, reindexed : OrderedDict Another dict, with the items in variables but replaced indexes. """ + from .dataarray import DataArray + import warnings + # build up indexers for assignment along each dimension to_indexers = {} from_indexers = {} @@ -354,6 +357,14 @@ def var_indexers(var, indexers): reindexed = OrderedDict() for dim, indexer in indexers.items(): + if isinstance(indexer, DataArray) and indexer.dims != ('dims', ): + warnings.warn( + "Indexer has dimensions {0:s} that are different " + "from that to be indexed along {1:s}. " + "This will behave differently in the future.".format( + str(indexer.dims), dim), + FutureWarning, stacklevel=3) + if dim in variables: var = variables[dim] args = (var.attrs, var.encoding) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d531eefb78c..0b26fa18554 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1582,17 +1582,6 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, indexers = utils.combine_pos_and_kw_args(indexers, kw_indexers, 'reindex') - from .dataarray import DataArray - import warnings - for dim, ind in indexers.items(): - if isinstance(ind, DataArray) and ind.dims != ('dims', ): - warnings.warn( - "Indexer has dimensions {0:s} that are different " - "from that to be indexed along {1:s}. 
" - "This will behave differently in the future.".format( - str(ind.dims), dim), - FutureWarning, stacklevel=3) - bad_dims = [d for d in indexers if d not in self.dims] if bad_dims: raise ValueError('invalid reindex dimensions: %s' % bad_dims) From 118a5d8f64b7c0673ed4b906b20d237739d6715e Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 5 Sep 2017 21:27:53 +0900 Subject: [PATCH 091/113] + Change API to attach non-dimensional coordinates. + Remove warnings for the coordinate conflict. --- xarray/core/dataset.py | 35 ++++++++++++---------------------- xarray/tests/test_dataset.py | 37 +++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0b26fa18554..03d356351a9 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5,6 +5,7 @@ from collections import Mapping, defaultdict from distutils.version import LooseVersion from numbers import Number +import warnings import sys @@ -1100,7 +1101,6 @@ def _validate_indexers(self, indexers): """ Here we make sure + indexer has a valid keys + indexer is in a valid data type - + raise an Error for some confusing case. """ from .dataarray import DataArray @@ -1129,37 +1129,27 @@ def _get_indexers_coordinates(self, indexers): Returns an OrderedDict mapping from coordinate name to the coordinate variable. - Coordinates to be extracted and attached should satisfy - + Dimension coordinate of the indexers. - Non-dimension coordinate of the indexers are not attached. - + Only coordinate with a name different from any of sef.variables. - - If self already has the same name coordinate, we raise an ValueError. + Only coordinate with a name different from any of sef.variables will + be attached. 
""" from .dataarray import DataArray - import warnings coord_list = [] for k, v in indexers.items(): if isinstance(v, DataArray): - coords = {d: v.coords[d].variable for d in v.dims - if d in v.coords} - if v.dtype.kind == 'b' and v.dims[0] in coords: + if v.dtype.kind == 'b': + if v.ndim != 1: # we only support 1-d boolean array + raise ValueError( + '{0:d}d-boolean array is used for indexing. ' + 'Only 1d-array is supported for boolean ' + 'indexing'.format(v.ndim)) # Make sure in case of boolean DataArray, its - # coordinate should be also indexed. - assert v.ndim == 1 # we only support 1-d boolean array - coords[v.dims[0]] = coords[v.dims[0]][v.variable] + # coordinate is also indexed. + v = v[v.values.nonzero()[0]] + coords = {d: v.coords[d].variable for d in v.coords} for k, vc in self.variables.items(): if k in coords and not vc[v.values].equals(coords[k]): - # TODO raise an Error in the next release - warnings.warn( - "Indexer's coordiante {0:s} conflicts with the " - "exisiting coordinate. This will raise an error " - "in the next release. 
" - "Use `.isel({0:s}=ind.drop(\'{0:s}\'))` to " - "index safely.".format(k), - FutureWarning, stacklevel=3) del coords[k] coord_list.append(coords) @@ -1479,7 +1469,6 @@ def sel_points(self, dim='points', method=None, tolerance=None, Dataset.isel_points DataArray.sel_points """ - import warnings warnings.warn('Dataset.sel_points is deprecated: use Dataset.sel()' 'instead', DeprecationWarning, stacklevel=2) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 4c9d58003f7..a0b226f9907 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -993,11 +993,6 @@ def test_isel_dataarray(self): actual = data.isel(dim2=indexing_da) self.assertDataArrayIdentical(actual['dim2'], data['dim2'].isel(dim2=np.arange(1, 4))) - # make sure the coordinate confliction raises a warning - with pytest.warns(FutureWarning) as ws: - actual = data.isel(dim2=indexing_da) - assert any(["Indexer's coordiante dim2 conflicts" in str(w.message) - for w in ws]) # isel for the coordinate. 
Should not attach the coordinate actual = data['dim2'].isel(dim2=indexing_da) @@ -1009,18 +1004,26 @@ def test_isel_dataarray(self): coords={'dim2': data['dim2'].values[1:4]}) self.assertDataArrayIdentical(data['dim2'][1:4], indexing_da['dim2']) - with pytest.warns(FutureWarning) as ws: - actual = data.isel(dim2=indexing_da) - # does not warn - assert all(["Indexer's coordiante dim2 conflicts" not in - str(w.message) for w in ws]) - warnings.warn('dummy', FutureWarning, stacklevel=3) - # boolean data array with coordinate with the same name indexing_da = (indexing_da < 3) actual = data.isel(dim2=indexing_da) self.assertDataArrayIdentical(actual['dim2'], data['dim2'][:2]) + # boolean data array with non-dimensioncoordinate + indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], + coords={'dim2': data['dim2'].values[1:4], + 'non_dim': (('dim2', ), [0, 1, 4]), + 'non_dim2': 0}) + indexing_da = (indexing_da < 3) + actual = data.isel(dim2=indexing_da) + self.assertDataArrayIdentical( + actual['dim2'].drop('non_dim').drop('non_dim2').drop('non_dim3'), + data['dim2'][:2]) + self.assertDataArrayIdentical( + actual['non_dim'], indexing_da['non_dim'][:2]) + self.assertDataArrayIdentical( + actual['non_dim2'], indexing_da['non_dim2']) + # boolean data array with coordinate with the different name indexing_da = DataArray(np.arange(1, 4), dims=['new_dim'], coords={'new_dim': np.random.randn(3)}) @@ -1030,14 +1033,14 @@ def test_isel_dataarray(self): self.assertDataArrayIdentical(actual['new_dim'].drop('dim2'), indexing_da['new_dim'][:2]) - # non-dimension coordinate will be ignored + # non-dimension coordinate will be also attached indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': np.random.randn(3), 'non_dim': (('dim2', ), np.random.randn(3))}) actual = data.isel(dim2=indexing_da) - assert 'non_dim' not in actual - assert 'non_dim' not in actual.coords + assert 'non_dim' in actual + assert 'non_dim' in actual.coords # indexing with DataArray with 
drop=True indexing_da = DataArray(np.arange(1, 4), dims=['a'], @@ -1049,9 +1052,9 @@ def test_isel_dataarray(self): # Index by a scalar DataArray indexing_da = DataArray(3, dims=[], coords={'station': 2}) actual = data.isel(dim2=indexing_da) - assert 'station' not in actual + assert 'station' in actual actual = data.isel(dim2=indexing_da['station']) - assert 'station' not in actual + assert 'station' in actual def test_sel(self): data = create_test_data() From dc9f8a66b5383f51400fb6052087ce7375734f4f Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 5 Sep 2017 21:41:09 +0900 Subject: [PATCH 092/113] Some clean up. Fix error in test_reindex_warning --- xarray/core/alignment.py | 10 +++++----- xarray/core/dataset.py | 27 ++++++++++----------------- xarray/core/indexing.py | 25 ++++++++++++------------- xarray/core/nputils.py | 5 +---- xarray/core/variable.py | 6 +++--- xarray/tests/test_dataset.py | 24 ++++++++++++++++-------- 6 files changed, 47 insertions(+), 50 deletions(-) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 35a0d93979b..e566fe699d3 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -4,13 +4,14 @@ import functools import operator from collections import defaultdict +import warnings import numpy as np from . import duck_array_ops from . import dtypes from . import utils -from .indexing import get_indexer +from .indexing import get_indexer_nd from .pycompat import iteritems, OrderedDict, suppress from .utils import is_full_slice, is_dict_like from .variable import Variable, IndexVariable @@ -302,7 +303,6 @@ def reindex_variables(variables, sizes, indexes, indexers, method=None, Another dict, with the items in variables but replaced indexes. 
""" from .dataarray import DataArray - import warnings # build up indexers for assignment along each dimension to_indexers = {} @@ -317,10 +317,10 @@ def reindex_variables(variables, sizes, indexes, indexers, method=None, raise ValueError( 'cannot reindex or align along dimension %r because the ' 'index has duplicate values' % name) - indexer = get_indexer(index, target, method, tolerance) + indexer = get_indexer_nd(index, target, method, tolerance) new_sizes[name] = len(target) - # Note pandas uses negative values from get_indexer to signify + # Note pandas uses negative values from get_indexer_nd to signify # values that are missing in the index # The non-negative values thus indicate the non-missing values to_indexers[name] = indexer >= 0 @@ -357,7 +357,7 @@ def var_indexers(var, indexers): reindexed = OrderedDict() for dim, indexer in indexers.items(): - if isinstance(indexer, DataArray) and indexer.dims != ('dims', ): + if isinstance(indexer, DataArray) and indexer.dims != (dim, ): warnings.warn( "Indexer has dimensions {0:s} that are different " "from that to be indexed along {1:s}. 
" diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 03d356351a9..cd29e8973ae 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1195,7 +1195,6 @@ def isel(self, drop=False, **indexers): indexers_list = self._validate_indexers(indexers) coord_vars = self._get_indexers_coordinates(indexers) - coord_names = set(self._coord_names) | set(coord_vars) variables = OrderedDict() for name, var in iteritems(self._variables): @@ -1204,11 +1203,10 @@ def isel(self, drop=False, **indexers): if not (drop and name in var_indexers): variables[name] = new_var - # attatch / overwrite coordinate in indexers - for k, v in coord_vars.items(): - variables[k] = v + # attach coordinate in indexers + variables.update(coord_vars) - coord_names = coord_names & set(variables) + coord_names = set(variables) & set(self._coord_names) | set(coord_vars) return self._replace_vars_and_dims(variables, coord_names=coord_names) def sel(self, method=None, tolerance=None, drop=False, **indexers): @@ -1263,26 +1261,21 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): See Also -------- Dataset.isel - Dataset.sel_points - Dataset.isel_points DataArray.sel """ from .dataarray import DataArray - new_coords = {k: v._coords for k, v in indexers.items() - if isinstance(v, DataArray)} - - indexers = {k: v.variable if isinstance(v, DataArray) else v - for k, v in indexers.items()} + v_indexers = {k: v.variable if isinstance(v, DataArray) else v + for k, v in indexers.items()} pos_indexers, new_indexes = indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance + self, v_indexers, method=method, tolerance=tolerance ) # attach indexer's coordinate to pos_indexers - for k, v in new_coords.items(): - if isinstance(pos_indexers[k], Variable): - pos_indexers[k] = DataArray(pos_indexers[k], coords=v, - dims=pos_indexers[k].dims) + for k, v in indexers.items(): + if isinstance(v, DataArray): + pos_indexers[k] = DataArray(pos_indexers[k], + 
coords=v.coords, dims=v.dims) result = self.isel(drop=drop, **pos_indexers) return result._replace_indexes(new_indexes) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 448bf759334..e3763303704 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -94,22 +94,20 @@ def get_loc(index, label, method=None, tolerance=None): return index.get_loc(label, **kwargs) -def get_indexer(index, labels, method=None, tolerance=None): +def get_indexer_nd(index, labels, method=None, tolerance=None): """ Call pd.Index.get_indexer(labels). If labels are Variable, The return type is also a Variable with the same dimension to labels. """ from .variable import Variable - kwargs = _index_method_kwargs(method, tolerance) + + flat_labels = np.ravel(labels) + flat_indexer = index.get_indexer(flat_labels, **kwargs) + indexer = flat_indexer.reshape(labels.shape) if isinstance(labels, Variable): - if labels.ndim > 1: - indexers = np.array(index.get_indexer(labels.data.flatten(), - **kwargs)) - return Variable(labels.dims, indexers.reshape(labels.shape)) - else: - return Variable(labels.dims, index.get_indexer(labels, **kwargs)) - return index.get_indexer(labels, **kwargs) + indexer = Variable(labels.dims, indexer) + return indexer def convert_label_indexer(index, label, index_name='', method=None, @@ -167,7 +165,7 @@ def convert_label_indexer(index, label, index_name='', method=None, elif label.dtype.kind == 'b': indexer = label else: - indexer = get_indexer(index, label, method, tolerance) + indexer = get_indexer_nd(index, label, method, tolerance) if np.any(indexer < 0): raise KeyError('not all values found in index %r' % index_name) @@ -332,8 +330,9 @@ def __init__(self, array, key=None): def _updated_key(self, new_key): # TODO should suport VectorizedIndexer if isinstance(new_key, VectorizedIndexer): - raise NotImplementedError('Vectorized indexing for {} is not ' - 'implemented.'.format(type(self))) + raise NotImplementedError( + 'Vectorized indexing for 
{} is not implemented. Load your ' + 'data first with .load() or .compute().'.format(type(self))) new_key = iter(expanded_indexer(new_key, self.ndim)) key = [] for size, k in zip(self.array.shape, self.key): @@ -512,7 +511,7 @@ def __init__(self, array): def __getitem__(self, key): def to_int_tuple(key): # workaround for uint64 indexer (GH:1406) - # TODO remove here after next dask releas (0.15.3) + # TODO remove here after next dask release (0.15.3) return tuple([k.astype(int) if isinstance(k, np.ndarray) else k for k in key]) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 873478875f5..a721425b839 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -108,10 +108,7 @@ def _advanced_indexer_subspaces(key): return (), () non_slices = [k for k in key if not isinstance(k, slice)] - if len(non_slices) == 1: # older np.broadcast does not support one array - ndim = len(getattr(non_slices[0], 'shape', [])) # should be 0 for int - else: - ndim = len(np.broadcast(*non_slices).shape) + ndim = len(np.broadcast(*non_slices).shape) mixed_positions = advanced_index_positions[0] + np.arange(ndim) vindex_positions = np.arange(ndim) return mixed_positions, vindex_positions diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 437384dcdb7..dc315fd218d 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1392,13 +1392,13 @@ def chunk(self, chunks=None, name=None, lock=False): def __getitem__(self, key): dims, index_tuple, new_order = self._broadcast_indexes(key) if len(dims) > 1: - # returns Variable rather than IndexVariable if multi-dimensional return Variable(self.dims, self.data, self._attrs, self._encoding, fastpath=True)[key] values = self._indexable_data[index_tuple] - if getattr(values, 'ndim', 0) == 0: - return Variable((), values, self._attrs, self._encoding) + if getattr(values, 'ndim', 0) != 1: + # returns Variable rather than IndexVariable if multi-dimensional + return Variable(dims, values, self._attrs, 
self._encoding) else: return type(self)(dims, values, self._attrs, self._encoding, fastpath=True) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index a0b226f9907..79d3fe24100 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -17,6 +17,7 @@ import numpy as np import pandas as pd +import warnings import xarray as xr import pytest @@ -976,7 +977,7 @@ def test_isel_fancy(self): self.assertArrayEqual(actual['var2'], expected_var2) self.assertArrayEqual(actual['var3'], expected_var3) - def test_isel_dataarray(self): + def array(self): """ Test for indexing by DataArray """ import warnings data = create_test_data() @@ -1017,8 +1018,7 @@ def test_isel_dataarray(self): indexing_da = (indexing_da < 3) actual = data.isel(dim2=indexing_da) self.assertDataArrayIdentical( - actual['dim2'].drop('non_dim').drop('non_dim2').drop('non_dim3'), - data['dim2'][:2]) + actual['dim2'].drop('non_dim').drop('non_dim2'), data['dim2'][:2]) self.assertDataArrayIdentical( actual['non_dim'], indexing_da['non_dim'][:2]) self.assertDataArrayIdentical( @@ -1100,9 +1100,18 @@ def test_sel_dataarray(self): # with different dimension ind = DataArray([0.0, 0.5, 1.0], dims=['new_dim']) actual = data.sel(dim2=ind) - expected = data.isel(dim2=[0, 1, 2]).rename({'dim2': 'new_dim'}) + expected = data.isel(dim2=Variable('new_dim', [0, 1, 2])) assert 'new_dim' in actual.dims - self.assertDatasetEqual(actual.drop('dim2'), expected.drop('new_dim')) + self.assertDatasetEqual(actual, expected) + + # Multi-dimensional + ind = DataArray([[0.0], [0.5], [1.0]], dims=['new_dim', 'new_dim2']) + actual = data.sel(dim2=ind) + expected = data.isel(dim2=Variable(('new_dim', 'new_dim2'), + [[0], [1], [2]])) + assert 'new_dim' in actual.dims + assert 'new_dim2' in actual.dims + self.assertDatasetEqual(actual, expected) # with coordinate ind = DataArray([0.0, 0.5, 1.0], dims=['new_dim'], @@ -1485,21 +1494,20 @@ def test_reindex(self): 
self.assertDatasetIdentical(expected, actual) def test_reindex_warning(self): - import warnings data = create_test_data() with pytest.warns(FutureWarning) as ws: # DataArray with different dimension raises Future warning ind = xr.DataArray([0.0, 1.0], dims=['new_dim'], name='ind') data.reindex(dim2=ind) - assert any(["Indexer ind has dimensions new_dim that are" not in + assert any(["Indexer has dimensions " in str(w.message) for w in ws]) with pytest.warns(FutureWarning) as ws: # Should not warn ind = xr.DataArray([0.0, 1.0], dims=['dim2'], name='ind') data.reindex(dim2=ind) - assert all(["Indexer ind has dimensions new_dim that are" not in + assert all(["Indexer has dimensions " not in str(w.message) for w in ws]) warnings.warn('dummy', FutureWarning, stacklevel=3) From 5726c89181d54e2928e7d37a9e627a7e9e9275a9 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 5 Sep 2017 23:23:57 +0900 Subject: [PATCH 093/113] Enable vindex for PandasIndexAdapter. --- xarray/core/indexing.py | 7 +++++-- xarray/core/variable.py | 4 ---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e3763303704..f841084b589 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -577,13 +577,16 @@ def shape(self): # .shape is broken on pandas prior to v0.15.2 return (len(self.array),) - def __getitem__(self, key): + def __getitem__(self, tuple_key): + key = to_tuple(tuple_key) if isinstance(key, tuple) and len(key) == 1: # unpack key so it can index a pandas.Index object (pandas.Index # objects don't like tuples) key, = key - key = to_tuple(key) + if getattr(key, 'ndim', 0) > 1: # Return np-array if multidimensional + return NumpyIndexingAdapter(self.array.values)[tuple_key] + result = self.array[key] if isinstance(result, pd.Index): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index dc315fd218d..e5de16bb45a 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1391,10 +1391,6 
@@ def chunk(self, chunks=None, name=None, lock=False): def __getitem__(self, key): dims, index_tuple, new_order = self._broadcast_indexes(key) - if len(dims) > 1: - return Variable(self.dims, self.data, self._attrs, self._encoding, - fastpath=True)[key] - values = self._indexable_data[index_tuple] if getattr(values, 'ndim', 0) != 1: # returns Variable rather than IndexVariable if multi-dimensional From 523ecaa9afbcda5a353aab2e89aa7b671b9dc187 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 6 Sep 2017 20:43:28 +0900 Subject: [PATCH 094/113] Add deprecation warning for isel_points --- xarray/core/dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cd29e8973ae..7e378eec584 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1317,6 +1317,9 @@ def isel_points(self, dim='points', **indexers): Dataset.sel_points DataArray.isel_points """ + warnings.warn('Dataset.isel_points is deprecated: use Dataset.isel()' + 'instead.', DeprecationWarning, stacklevel=2) + indexer_dims = set(indexers) def take(variable, slices): @@ -1463,7 +1466,7 @@ def sel_points(self, dim='points', method=None, tolerance=None, DataArray.sel_points """ warnings.warn('Dataset.sel_points is deprecated: use Dataset.sel()' - 'instead', DeprecationWarning, stacklevel=2) + 'instead.', DeprecationWarning, stacklevel=2) pos_indexers, _ = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance From 765ae45f5093ed1475f856f1c9e9ef5b769bee7e Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 6 Sep 2017 23:05:28 +0900 Subject: [PATCH 095/113] Add a sanity check for boolean vectorized indexing. 
--- xarray/core/dataset.py | 27 ++++++++++++++++----------- xarray/core/variable.py | 5 +++++ xarray/tests/test_variable.py | 5 +++++ 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 65889132cac..bb0bf44327d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1168,10 +1168,9 @@ def _get_indexers_coordinates(self, indexers): if v.ndim != 1: # we only support 1-d boolean array raise ValueError( '{0:d}d-boolean array is used for indexing. ' - 'Only 1d-array is supported for boolean ' - 'indexing'.format(v.ndim)) + 'Only 1d-array is supported.'.format(v.ndim)) # Make sure in case of boolean DataArray, its - # coordinate is also indexed. + # coordinate also should be indexed. v = v[v.values.nonzero()[0]] coords = {d: v.coords[d].variable for d in v.coords} @@ -1203,15 +1202,17 @@ def isel(self, drop=False, **indexers): by integers, slice objects or arrays. indexer can be a integer, slice, array-like or even DataArray. If DataArrays are passed as indexers, xarray-style indexing will be - carried out. + carried out. See :ref:`indexing` for the details. Returns ------- obj : Dataset A new Dataset with the same contents as this dataset, except each - array and dimension is indexed by the appropriate indexers. In - general, each array's data will be a view of the array's data - in this dataset, unless numpy fancy indexing was triggered by using + array and dimension is indexed by the appropriate indexers. + If indexer DataArrays have coordinates that do not conflict to this + object, then these coordinates will be attached. + In general, each array's data will be a view of the array's data + in this dataset, unless vectorized indexing was triggered by using an array indexer, in which case the data will be a copy. 
See Also @@ -1220,7 +1221,6 @@ def isel(self, drop=False, **indexers): DataArray.isel """ indexers_list = self._validate_indexers(indexers) - coord_vars = self._get_indexers_coordinates(indexers) variables = OrderedDict() @@ -1275,16 +1275,21 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): by scalars, slices or arrays of tick labels. For dimensions with multi-index, the indexer may also be a dict-like object with keys matching index level names. + If DataArrays are passed as indexers, xarray-style indexing will be + carried out. See :ref:`indexing` for the details. Returns ------- obj : Dataset A new Dataset with the same contents as this dataset, except each - variable and dimension is indexed by the appropriate indexers. In - general, each variable's data will be a view of the variable's data - in this dataset, unless numpy fancy indexing was triggered by using + variable and dimension is indexed by the appropriate indexers. + If indexer DataArrays have coordinates that do not conflict to this + object, then these coordinates will be attached. + In general, each array's data will be a view of the array's data + in this dataset, unless vectorized indexing was triggered by using an array indexer, in which case the data will be a copy. + See Also -------- Dataset.isel diff --git a/xarray/core/variable.py b/xarray/core/variable.py index b0057387f46..57eb3f6fe19 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -521,6 +521,11 @@ def _broadcast_indexes_vectorized(self, key): raise IndexError("{}-dimensional boolean indexing is " "not supported. 
".format( variable.ndim)) + if self.shape[self.get_axis_num(dim)] != len(variable): + raise IndexError( + "Boolean array size {0:d} is used to index array " + "with shape {1:s}.".format(len(variable), + str(self.shape))) (variable,) = variable._nonzero() variables.append(variable) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 5d91c0d5f05..ecfdc956f09 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -539,6 +539,11 @@ def test_getitem_advanced(self): expected = v[dict(y=2)] self.assertArrayEqual(v_new, expected) + # with boolean variable with wrong shape + ind = np.array([True, False]) + with self.assertRaisesRegexp(IndexError, 'Boolean array size 2 is '): + v[Variable(('a', 'b'), [[0, 1]]), ind] + def test_getitem_uint_1d(self): # regression test for #1405 v = self.cls(['x'], [0, 1, 2]) From 3deaf5c10897a78d5f4e30ae2105222620392530 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 6 Sep 2017 23:37:49 +0900 Subject: [PATCH 096/113] Modify tests to take care of the sanity check related to boolean array indexing. 
--- xarray/core/dataset.py | 4 +++- xarray/core/indexing.py | 6 ++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bb0bf44327d..268da666030 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1297,7 +1297,7 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): """ from .dataarray import DataArray - v_indexers = {k: v.variable if isinstance(v, DataArray) else v + v_indexers = {k: v.variable.data if isinstance(v, DataArray) else v for k, v in indexers.items()} pos_indexers, new_indexes = indexing.remap_label_indexers( @@ -1305,6 +1305,8 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): ) # attach indexer's coordinate to pos_indexers for k, v in indexers.items(): + if isinstance(v, Variable): + pos_indexers[k] = Variable(v.dims, pos_indexers[k]) if isinstance(v, DataArray): pos_indexers[k] = DataArray(pos_indexers[k], coords=v.coords, dims=v.dims) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f841084b589..57bfdbc5b13 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -99,14 +99,11 @@ def get_indexer_nd(index, labels, method=None, tolerance=None): The return type is also a Variable with the same dimension to labels. 
""" - from .variable import Variable kwargs = _index_method_kwargs(method, tolerance) flat_labels = np.ravel(labels) flat_indexer = index.get_indexer(flat_labels, **kwargs) indexer = flat_indexer.reshape(labels.shape) - if isinstance(labels, Variable): - indexer = Variable(labels.dims, indexer) return indexer @@ -156,7 +153,8 @@ def convert_label_indexer(index, label, index_name='', method=None, ) else: - label = label if hasattr(label, 'dims') else _asarray_tuplesafe(label) + label = (label if getattr(label, 'ndim', 1) > 1 # vectorized-indexing + else _asarray_tuplesafe(label)) if label.ndim == 0: if isinstance(index, pd.MultiIndex): indexer, new_index = index.get_loc_level(label.item(), level=0) From c8c8a12ab3f1107833c11b1d4c8143927689bccf Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 6 Sep 2017 23:38:48 +0900 Subject: [PATCH 097/113] Another follow up --- xarray/tests/test_indexing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 8e4b88011bc..0002835cea7 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -243,6 +243,11 @@ def nonzero(x): np.arange(10) < 5] for i, j, k in itertools.product(indexers, repeat=3): + if isinstance(j, np.ndarray) and j.dtype.kind == 'b': # match size + j = np.arange(20) < 4 + if isinstance(k, np.ndarray) and k.dtype.kind == 'b': + k = np.arange(30) < 8 + _, expected, new_order = v._broadcast_indexes_vectorized((i, j, k)) expected_data = nputils.NumpyVIndexAdapter(v.data)[expected] if new_order: From a16a04b98dc706a3da5a26c824883e6031e838a3 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 7 Sep 2017 00:18:24 +0900 Subject: [PATCH 098/113] pep8 --- xarray/core/dataset.py | 8 ++++---- xarray/tests/test_dataset.py | 1 - xarray/tests/test_indexing.py | 3 ++- xarray/tests/test_variable.py | 1 - 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 
268da666030..030e8f0855d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1209,8 +1209,8 @@ def isel(self, drop=False, **indexers): obj : Dataset A new Dataset with the same contents as this dataset, except each array and dimension is indexed by the appropriate indexers. - If indexer DataArrays have coordinates that do not conflict to this - object, then these coordinates will be attached. + If indexer DataArrays have coordinates that do not conflict with + this object, then these coordinates will be attached. In general, each array's data will be a view of the array's data in this dataset, unless vectorized indexing was triggered by using an array indexer, in which case the data will be a copy. @@ -1283,8 +1283,8 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): obj : Dataset A new Dataset with the same contents as this dataset, except each variable and dimension is indexed by the appropriate indexers. - If indexer DataArrays have coordinates that do not conflict to this - object, then these coordinates will be attached. + If indexer DataArrays have coordinates that do not conflict with + this object, then these coordinates will be attached. In general, each array's data will be a view of the array's data in this dataset, unless vectorized indexing was triggered by using an array indexer, in which case the data will be a copy. diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 18ca18e8a34..450e9c9863e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -985,7 +985,6 @@ def test_isel_fancy(self): def array(self): """ Test for indexing by DataArray """ - import warnings data = create_test_data() # indexing with DataArray with same-name coordinates. 
indexing_da = DataArray(np.arange(1, 4), dims=['dim1'], diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 0002835cea7..e6794823a32 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -252,7 +252,8 @@ def nonzero(x): expected_data = nputils.NumpyVIndexAdapter(v.data)[expected] if new_order: old_order = range(len(new_order)) - expected_data = np.moveaxis(expected_data, old_order, new_order) + expected_data = np.moveaxis(expected_data, old_order, + new_order) outer_index = indexing.OuterIndexer( (nonzero(i), nonzero(j), nonzero(k))) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index ecfdc956f09..cd63a07eb26 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -569,7 +569,6 @@ def test_getitem_0d_array(self): v = self.cls(['x'], [0, 1, 2]) v_data = v.compute().data - ind = np.array(0, dtype='int') # This is 0d-array v_new = v[np.array([0])[0]] self.assertArrayEqual(v_new, v_data[0]) From 1b34cd4df536f81b87ace3d56b110bca04d4fcd0 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 7 Sep 2017 22:09:41 +0900 Subject: [PATCH 099/113] Clean up sanity checks in broadcast_indexers --- xarray/core/dataset.py | 5 ++-- xarray/core/variable.py | 52 ++++++++++++++++++++-------------- xarray/tests/test_dataarray.py | 6 ---- xarray/tests/test_indexing.py | 4 +++ xarray/tests/test_variable.py | 25 +++++++++++----- 5 files changed, 56 insertions(+), 36 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 030e8f0855d..cd5dc484f09 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1164,6 +1164,7 @@ def _get_indexers_coordinates(self, indexers): coord_list = [] for k, v in indexers.items(): if isinstance(v, DataArray): + v_coords = v.coords if v.dtype.kind == 'b': if v.ndim != 1: # we only support 1-d boolean array raise ValueError( @@ -1171,8 +1172,8 @@ def _get_indexers_coordinates(self, indexers): 'Only 1d-array is 
supported.'.format(v.ndim)) # Make sure in case of boolean DataArray, its # coordinate also should be indexed. - v = v[v.values.nonzero()[0]] - coords = {d: v.coords[d].variable for d in v.coords} + v_coords = v[v.values.nonzero()[0]].coords + coords = {d: v_coords[d].variable for d in v.coords} for k, vc in self.variables.items(): if k in coords and not vc[v.values].equals(coords[k]): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 57eb3f6fe19..9906327efb5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -446,6 +446,7 @@ def _broadcast_indexes(self, key): if all(isinstance(k, BASIC_INDEXING_TYPES) for k in key): return self._broadcast_indexes_basic(key) + self._validate_indexers(key) # Detect it can be mapped as an outer indexer # If all key is unlabeled, or # key can be mapped as an OuterIndexer. @@ -473,6 +474,34 @@ def _broadcast_indexes_basic(self, key): if not isinstance(k, integer_types)) return dims, BasicIndexer(key), None + def _validate_indexers(self, key): + """ Make sanity checks """ + for dim, k in zip(self.dims, key): + if isinstance(k, BASIC_INDEXING_TYPES): + pass + else: + if not isinstance(k, Variable): + k = np.asarray(k) + if k.ndim > 1: + raise IndexError( + "Unlabeled multi-dimensional array cannot be " + "used for indexing: {}".format(k)) + if k.dtype.kind == 'b': + if self.shape[self.get_axis_num(dim)] != len(k): + raise IndexError( + "Boolean array size {0:d} is used to index array " + "with shape {1:s}.".format(len(k), + str(self.shape))) + if k.ndim > 1: + raise IndexError("{}-dimensional boolean indexing is " + "not supported. ".format(k.ndim)) + if getattr(k, 'dims', (dim, )) != (dim, ): + raise IndexError( + "Boolean indexer should be unlabeled or on the " + "same dimension to the indexed array. 
Indexer is " + "on {0:s} but the target dimension is " + "{1:s}.".format(str(k.dims), dim)) + def _broadcast_indexes_outer(self, key): dims = tuple(k.dims[0] if isinstance(k, Variable) else dim for k, dim in zip(key, self.dims) @@ -486,10 +515,6 @@ def _broadcast_indexes_outer(self, key): indexer.append(k) else: k = np.asarray(k) - if k.ndim > 1: - raise IndexError("Unlabeled multi-dimensional array " - "cannot be used for indexing: {}".format( - k)) indexer.append(k if k.dtype.kind != 'b' else np.flatnonzero(k)) return dims, OuterIndexer(indexer), None @@ -502,30 +527,15 @@ def _nonzero(self): in zip(nonzeros, self.dims)) def _broadcast_indexes_vectorized(self, key): - variables = [] out_dims_set = OrderedSet() for dim, value in zip(self.dims, key): if isinstance(value, slice): out_dims_set.add(dim) else: - try: - variable = (value if isinstance(value, Variable) else - as_variable(value, name=dim)) - except MissingDimensionsError: # change to better exception - raise IndexError("Unlabeled multi-dimensional array " - "cannot be used for indexing.") - + variable = (value if isinstance(value, Variable) else + as_variable(value, name=dim)) if variable.dtype.kind == 'b': # boolean indexing case - if variable.ndim > 1: - raise IndexError("{}-dimensional boolean indexing is " - "not supported. 
".format( - variable.ndim)) - if self.shape[self.get_axis_num(dim)] != len(variable): - raise IndexError( - "Boolean array size {0:d} is used to index array " - "with shape {1:s}.".format(len(variable), - str(self.shape))) (variable,) = variable._nonzero() variables.append(variable) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d0a122e35b4..0051152e4f6 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -490,12 +490,6 @@ def test_getitem_dataarray(self): self.assertDataArrayEqual(da[ind], da[[0, 1]]) self.assertDataArrayEqual(da[ind], da[ind.values]) - ind = xr.DataArray([True, True, False], dims=['a'], - coords={'a': [0, 1, 2]}) - actual = da[ind] - assert 'a' in actual - self.assertArrayEqual(actual['a'], [0, 1]) - def test_setitem(self): # basic indexing should work as numpy's indexing tuples = [(0, 0), (0, slice(None, None)), diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index e6794823a32..7c8af81baf8 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -153,6 +153,10 @@ def test_lazily_indexed_array(self): for i in indexers: for j in indexers: for k in indexers: + if isinstance(j, np.ndarray) and j.dtype.kind == 'b': + j = np.arange(20) < 5 + if isinstance(k, np.ndarray) and k.dtype.kind == 'b': + k = np.arange(30) < 5 expected = np.asarray(v[i, j, k]) for actual in [v_lazy[i, j, k], v_lazy[:, j, k][i], diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index cd63a07eb26..c93b4d98367 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -86,6 +86,17 @@ def test_getitem_1d_fancy(self): expected = np.array(v._data)[([0, 1], [0, 1]), ] self.assertArrayEqual(v_new, expected) + # boolean indexing + ind = Variable(('x', ), [True, False, True]) + v_new = v[ind] + self.assertVariableIdentical(v[[0, 2]], v_new) + v_new = v[[True, False, True]] + self.assertVariableIdentical(v[[0, 2]], v_new) + 
+ with self.assertRaisesRegexp(IndexError, "Boolean indexer should"): + ind = Variable(('a', ), [True, False, True]) + v[ind] + def _assertIndexedLikeNDArray(self, variable, expected_value0, expected_dtype=None): """Given a 1-dimensional variable, verify that the variable is indexed @@ -524,15 +535,10 @@ def test_getitem_advanced(self): self.assertArrayEqual(v_new, v_data[[0, 1]][:, [0, 1]]) # boolean indexing - v_new = v[dict(x=[True, False], y=[False, True])] + v_new = v[dict(x=[True, False], y=[False, True, False])] assert v_new.dims == ('x', 'y') self.assertArrayEqual(v_new, v_data[0][1]) - ind = Variable(['a'], [True, False]) - v_new = v[dict(y=ind)] - assert v_new.dims == ('x', 'a') - self.assertArrayEqual(v_new, v_data[:, 0:1]) - # with scalar variable ind = Variable((), 2) v_new = v[dict(y=ind)] @@ -544,6 +550,11 @@ def test_getitem_advanced(self): with self.assertRaisesRegexp(IndexError, 'Boolean array size 2 is '): v[Variable(('a', 'b'), [[0, 1]]), ind] + # boolean indexing with different dimension + ind = Variable(['a'], [True, False, False]) + with self.assertRaisesRegexp(IndexError, 'Boolean indexer should be'): + v[dict(y=ind)] + def test_getitem_uint_1d(self): # regression test for #1405 v = self.cls(['x'], [0, 1, 2]) @@ -1384,7 +1395,7 @@ def test_setitem(self): self.assertArrayEqual(v[0], np.ones_like(v[0])) v = Variable(['x', 'y'], [[0, 3, 2], [3, 4, 5]]) - v[dict(x=[True, False], y=[False, True])] = 1 + v[dict(x=[True, False], y=[False, True, False])] = 1 self.assertTrue(v[0, 1] == 1) # dimension broadcast From 24599a756ccf98c3b0317523cd03850678c24986 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Mon, 11 Sep 2017 10:07:00 +0900 Subject: [PATCH 100/113] Fix unintended rename --- xarray/tests/test_dataset.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 450e9c9863e..ae174e39096 100644 --- a/xarray/tests/test_dataset.py +++ 
b/xarray/tests/test_dataset.py @@ -983,7 +983,7 @@ def test_isel_fancy(self): self.assertArrayEqual(actual['var2'], expected_var2) self.assertArrayEqual(actual['var3'], expected_var3) - def array(self): + def test_isel_dataarray(self): """ Test for indexing by DataArray """ data = create_test_data() # indexing with DataArray with same-name coordinates. @@ -1011,14 +1011,17 @@ def array(self): self.assertDataArrayIdentical(data['dim2'][1:4], indexing_da['dim2']) # boolean data array with coordinate with the same name + indexing_da = DataArray(np.arange(1, 10), dims=['dim2'], + coords={'dim2': data['dim2'].values}) indexing_da = (indexing_da < 3) actual = data.isel(dim2=indexing_da) self.assertDataArrayIdentical(actual['dim2'], data['dim2'][:2]) # boolean data array with non-dimensioncoordinate - indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], - coords={'dim2': data['dim2'].values[1:4], - 'non_dim': (('dim2', ), [0, 1, 4]), + indexing_da = DataArray(np.arange(1, 10), dims=['dim2'], + coords={'dim2': data['dim2'].values, + 'non_dim': (('dim2', ), + np.random.randn(9)), 'non_dim2': 0}) indexing_da = (indexing_da < 3) actual = data.isel(dim2=indexing_da) @@ -1029,15 +1032,6 @@ def array(self): self.assertDataArrayIdentical( actual['non_dim2'], indexing_da['non_dim2']) - # boolean data array with coordinate with the different name - indexing_da = DataArray(np.arange(1, 4), dims=['new_dim'], - coords={'new_dim': np.random.randn(3)}) - actual = data.isel(dim2=indexing_da < 3) - assert 'new_dim' in actual - assert 'new_dim' in actual.coords - self.assertDataArrayIdentical(actual['new_dim'].drop('dim2'), - indexing_da['new_dim'][:2]) - # non-dimension coordinate will be also attached indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': np.random.randn(3), From d5d967b76af46db918f35f505c7933a19b509cfd Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 10 Sep 2017 19:03:17 -0700 Subject: [PATCH 101/113] indexing.rst edits --- doc/indexing.rst | 
125 ++++++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 51 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 471133814f4..ab3b678d649 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -11,19 +11,19 @@ Indexing and selecting data import xarray as xr np.random.seed(123456) +xarray offers extremely flexible indexing routines that combine the best +features of NumPy and pandas for data selection. -The point of xarray is to introduce a numpy-ndarray-like multidimensional array object into a powerful pandas's flexible data handling scheme. -We provide several (say, numpy-like, pandas-like, and more advanced type) indexing functionalities. - -The most basic way to access each element of xarray's multi-dimensional -object is to use Python ``[obj]`` syntax, such as ``array[i, j]``, where ``i`` and ``j`` are both integers. -As xarray objects can store coordinates corresponding to each dimension of the +The most basic way to access elements of a :py:class:`~xarray.DataArray` +object is to use Python's ``[]`` syntax, such as ``array[i, j]``, where +``i`` and ``j`` are both integers. +As xarray objects can store coordinates corresponding to each dimension of an array, label-based indexing similar to ``pandas.DataFrame.loc`` is also possible. In label-based indexing, the element position ``i`` is automatically looked-up from the coordinate values. -Dimensions of xarray object have names and you can also lookup the dimensions -by name, instead of remembering the positional ordering of dimensions by yourself. +Dimensions of xarray objects have names, so you can also lookup the dimensions +by name, instead of remembering their positional order. Thus in total, xarray supports four different kinds of indexing, as described below and summarized in this table: @@ -271,13 +271,15 @@ elements that are fully masked: Vectorized Indexing ------------------- -xarray supports many types of indexing with a `vectorized` manner. 
+Like numpy and pandas, xarray supports indexing many array elements at once in a +`vectorized` manner. -If you provide an integer, slice, or unlabeled array (array without dimension names, such as ``np.ndarray``, ``list``, but not :py:meth:`~xarray.DataArray` or :py:meth:`~xarray.Variable`) -our indexing is basically orthogonal. -For example, -if you pass multiple integer sequences to an array, they work independently -along each dimension (similar to the way vector subscripts work in fortran). +If you only provide integers, slices, or unlabeled arrays (array without +dimension names, such as ``np.ndarray``, ``list``, but not +:py:meth:`~xarray.DataArray` or :py:meth:`~xarray.Variable`) indexing can be +understood as orthogonal. Each indexer component selects independently along +the corresponding dimension, similar to how vector indexing works in Fortran or +MATLAB, or after using the :py:func:`numpy.ix_` helper: .. ipython:: python @@ -285,13 +287,11 @@ along each dimension (similar to the way vector subscripts work in fortran). coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) da da[[0, 1], [1, 1]] - # Sequential indexing gives the same result. - da[[0, 1], [1, 1]] == da[[0, 1]][:, [1, 1]] -In order to make more advanced indexing, you can supply -:py:meth:`~xarray.DataArray` as indexers. -In this case, the dimension of the resultant array is determined -by the indexers' dimension names, +For more flexibility, you can supply :py:meth:`~xarray.DataArray` objects +as indexers. +Dimensions on resultant arrays are given by the ordered union of the indexers' +dimensions: .. ipython:: python @@ -300,9 +300,8 @@ by the indexers' dimension names, da[ind_x, ind_y] # orthogonal indexing da[ind_x, ind_x] # vectorized indexing -Slices or sequences, which do not have named-dimensions, -as a manner of fact, -will be understood as the same dimension which is indexed along.
+Slices or sequences/arrays without named-dimensions are treated as if they have +the same dimension which is indexed along: .. ipython:: python @@ -312,17 +311,21 @@ will be understood as the same dimension which is indexed along. Furthermore, you can use multi-dimensional :py:meth:`~xarray.DataArray` as indexers, where the resultant array dimension is also determined by -indexers' dimension, +indexers' dimension: .. ipython:: python ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) da[ind] -To summarize, our advanced indexing is based on our broadcasting scheme. -See :ref:`xarray_indexing_rules` for the full list of our indexing rule. +In briefly, similar to how NumPy's `advanced indexing`_ works, vectorized +indexing for xarray is based on our +:ref:`broadcasting rules `. +See :ref:`indexing.rules` for the complete specification. + +.. _advanced indexing: https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html -These vectorized indexing also works with ``isel``, ``loc``, and ``sel``. +Vectorized indexing also works with ``isel``, ``loc``, and ``sel``: .. ipython:: python @@ -332,22 +335,28 @@ These vectorized indexing also works with ``isel``, ``loc``, and ``sel``. ind = xr.DataArray([['a', 'b'], ['b', 'a']], dims=['a', 'b']) da.loc[:, ind] # same to da.sel(y=ind) - -and also for Dataset +and also for ``Dataset`` .. ipython:: python ds2 = da.to_dataset(name='bar') ds2.isel(x=xr.DataArray([0, 1, 2], dims=['points'])) +.. tip:: + + If you are lazily loading your data from disk, not every form of vectorized + indexing is supported (or if supported, may not be supported efficiently). + You may find increased performance by loading your data into memory first, + e.g., with :py:meth:`~xarray.Dataset.load`. + .. note:: - This advanced indexing was newly added in v.0.10. - In the older version of xarray, dimensions of indexers are not used. - Special methods to realize some advanced indexing, + + Vectorized indexing is a new feature in v0.10. 
+ In older versions of xarray, dimensions of indexers are ignored. + Dedicated methods for some advanced indexing use cases, ``isel_points`` and ``sel_points`` are now deprecated. See :ref:`more_advanced_indexing` for their alternative. - .. _assigning_values: Assigning values with indexing @@ -416,8 +425,8 @@ __ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-t More advanced indexing ----------------------- -The use of :py:meth:`~xarray.DataArray` as indexers enables very flexible indexing. -The following is an example of the pointwise indexing, +The use of :py:meth:`~xarray.DataArray` objects as indexers enables very +flexible indexing. The following is an example of the pointwise indexing: .. ipython:: python @@ -438,8 +447,8 @@ you can supply a :py:meth:`~xarray.DataArray` with a coordinate, coords={'z': ['a', 'b', 'c']}), y=xr.DataArray([0, 1, 0], dims='z')) - -Analogously, label-based pointwise-indexing is also possible by ``.sel`` method, +Analogously, label-based pointwise-indexing is also possible by the ``.sel`` +method: .. ipython:: python @@ -448,7 +457,6 @@ Analogously, label-based pointwise-indexing is also possible by ``.sel`` method, arr.sel(space=xr.DataArray(['IA', 'IL', 'IN'], dims=['new_time']), time=times) - .. _align and reindex: Align and reindex @@ -648,28 +656,43 @@ dimensions or use the ellipsis in the ``loc`` specifier, e.g. in the example above, ``mda.loc[{'one': 'a', 'two': 0}, :]`` or ``mda.loc[('a', 0), ...]``. -.. _xarray_indexing_rules: +.. _indexing.rules: + +Indexing rules +-------------- + +Here we describe the full rules xarray uses for vectorized indexing. Note that +this is for the purposes of explanation: for the sake of efficiency and to +support various backends, the actual implementation is different. -xarray indexing rules ---------------------- +0. (Only for label based indexing.) Look up positional indexes along each + dimension from the corresponding :py:class:`pandas.Index`. 
-The detailed indexing scheme in xarray is as follows. -(Note that it is for the explanation purpose and the actual implementation is differ.) +1. A full slice object ``:`` is inserted for each dimension without an indexer. -0. (Only for label based indexing.) Look up positional indexes along each dimension based on :py:class:`pandas.Index`. +2. ``slice`` objects are converted into arrays, given by + ``np.arange(*slice.indices(...))``. -1. ``slice`` is converted to an array, such that ``np.arange(*slice.indices(...))``. +3. Assume dimension names for array indexers without dimensions, such as + ``np.ndarray`` and ``list``, from the dimensions to be indexed along. + For example, ``v.isel(x=[0, 1])`` is understood as + ``v.isel(x=xr.DataArray([0, 1], dims=['x']))``. -2. Assume dimension names of array indexers without dimension, such as ``np.ndarray`` and ``list``, from the dimensions to be indexed along. For example, ``v.isel(x=[0, 1])`` is understood as ``v.isel(x=xr.DataArray([0, 1], dims=['x']))``. +4. For each variable in a ``Dataset`` or ``DataArray`` (the array and its + coordinates): -3. Broadcast all the indexers based on their dimension names (see :ref:`compute.broadcasting` for our name-based broadcasting). + a. Broadcast all relevant indexers based on their dimension names + (see :ref:`compute.broadcasting` for full details). -4. Index the object by the broadcasted indexers. + b. Index the underlying array by the broadcast indexers, using NumPy's + advanced indexing rules. -5. If an indexer-DataArray has coordinates, attached them to the indexed object. +5. If any indexer DataArray has coordinates and no coordinate with the + same name exists, attach them to the indexed object. .. note:: - + There should not be a conflict between the coordinates of indexer- and indexed- DataArrays. In v.0.10.0, xarray raises ``FutureWarning`` if there is such a conflict, but in the next major release, it will raise an Error. 
+ - ``IndexError`` is raised if there is a conflict between dimension + coordinates of indexer- and indexed- DataArrays. - + Only 1-dimensional boolean array can be used as an indexer. + - Only 1-dimensional boolean arrays can be used as indexers. From d0d6a6f645a3d5d0ff8e8ac298cd200e2fcf51ac Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sat, 16 Sep 2017 21:58:52 -0700 Subject: [PATCH 102/113] remove note about conflicts for now --- doc/indexing.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index ab3b678d649..0b78b3384dc 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -692,7 +692,4 @@ support various backends, the actual implementation is different. .. note:: - - ``IndexError`` is raised if there is a conflict between dimension - coordinates of indexer- and indexed- DataArrays. - - - Only 1-dimensional boolean arrays can be used as indexers. + Only 1-dimensional boolean arrays can be used as indexers. From 4f08e2eb190cc1f9eb09e955fcf0fff0c00f508a Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 17 Sep 2017 23:06:13 +0900 Subject: [PATCH 103/113] Apply coordinate conflict rule. --- xarray/core/dataset.py | 73 +++++++++++++++++++++++++++++------- xarray/tests/test_dataset.py | 49 ++++++++++++++++-------- 2 files changed, 94 insertions(+), 28 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cd5dc484f09..998b625b4f3 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1173,17 +1173,59 @@ def _get_indexers_coordinates(self, indexers): # Make sure in case of boolean DataArray, its # coordinate also should be indexed. 
v_coords = v[v.values.nonzero()[0]].coords - coords = {d: v_coords[d].variable for d in v.coords} - for k, vc in self.variables.items(): - if k in coords and not vc[v.values].equals(coords[k]): - del coords[k] - - coord_list.append(coords) + coord_list.append({d: v_coords[d].variable for d in v.coords}) # we don't need to call align() explicitly, because merge_variables # already checks for exact alignment between dimension coordinates - return merge_variables(coord_list) + coords = merge_variables(coord_list) + + for k in self.dims: + # make sure there are not conflict in dimension coordinates + if (k in coords and k in self._variables and + not coords[k].equals(self._variables[k])): + raise IndexError('Dimension coordinate {0:s} conflicts between' + ' indexed and indexing objects.'.format(k)) + return coords + + def _drop_nonpriority_coords(self, indexers, mode='sel'): + """ + Drop non-priority coords from indexers. + + indexers: mapping from dimension to indexers. + Mode: one of 'isel' | 'sel' + Returns: new indexer + + Common rule: + 1. If object is constructed from coordinate, the same name coordinates + of the indexer will be dropped. + + Rules for `sel` mode + 2. Indexed coordinates from the indexed object take precedence. 
+ """ + from .dataarray import DataArray, _ThisArray + + # If Dataset is constructed from DataArray, skip consistency check + this_arr = None + for k, v in self._variables.items(): + if isinstance(k, _ThisArray): + this_arr = v + + def drop_coord(v, k): + return v.drop(k) if (isinstance(v, DataArray) and k in v) else v + + new_indexers = OrderedDict() + for k, v in indexers.items(): + # rule 1 + if (this_arr is not None and k in self._variables and + this_arr.equals(self._variables[k])): + new_indexers[k] = drop_coord(v, k) + # rule 2 + elif mode == 'sel' and k in self._coord_names: + new_indexers[k] = drop_coord(v, k) + else: + new_indexers[k] = v + return new_indexers def isel(self, drop=False, **indexers): """Returns a new dataset with each array indexed along the specified @@ -1221,8 +1263,8 @@ def isel(self, drop=False, **indexers): Dataset.sel DataArray.isel """ + indexers = self._drop_nonpriority_coords(indexers, mode='isel') indexers_list = self._validate_indexers(indexers) - coord_vars = self._get_indexers_coordinates(indexers) variables = OrderedDict() for name, var in iteritems(self._variables): @@ -1231,11 +1273,15 @@ def isel(self, drop=False, **indexers): if not (drop and name in var_indexers): variables[name] = new_var - # attach coordinate in indexers - variables.update(coord_vars) - - coord_names = set(variables) & set(self._coord_names) | set(coord_vars) - return self._replace_vars_and_dims(variables, coord_names=coord_names) + coord_names = set(variables) & set(self._coord_names) + selected = self._replace_vars_and_dims( + variables, coord_names=coord_names) + # Dataset consisting of only coordinate. 
+ coord_vars = selected._get_indexers_coordinates(indexers) + coords = self._replace_vars_and_dims( + coord_vars, coord_names=list(coord_vars.keys())) + # Drop conflicted variable from indexers coordinate + return coords.update(selected) def sel(self, method=None, tolerance=None, drop=False, **indexers): """Returns a new dataset with each array indexed by tick labels @@ -1298,6 +1344,7 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): """ from .dataarray import DataArray + indexers = self._drop_nonpriority_coords(indexers, mode='sel') v_indexers = {k: v.variable.data if isinstance(v, DataArray) else v for k, v in indexers.items()} diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index ae174e39096..2c396c0d0eb 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -993,14 +993,16 @@ def test_isel_dataarray(self): self.assertDataArrayIdentical(indexing_da['dim1'], actual['dim1']) self.assertDataArrayIdentical(data['dim2'], actual['dim2']) - # not overwrite coordinate + # Conflict in the dimension coordinate indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': np.random.randn(3)}) - actual = data.isel(dim2=indexing_da) - self.assertDataArrayIdentical(actual['dim2'], - data['dim2'].isel(dim2=np.arange(1, 4))) + with self.assertRaisesRegexp(IndexError, "Dimension coordinate dim2"): + actual = data.isel(dim2=indexing_da) + # Also the case for DataArray + with self.assertRaisesRegexp(IndexError, "Dimension coordinate dim2"): + actual = data['var2'].isel(dim2=indexing_da) - # isel for the coordinate. Should not attach the coordinate + # isel for the coordinate variable. 
Should not attach the coordinate actual = data['dim2'].isel(dim2=indexing_da) self.assertDataArrayIdentical(actual, data['dim2'].isel(dim2=np.arange(1, 4))) @@ -1008,7 +1010,15 @@ def test_isel_dataarray(self): # same name coordinate which does not conflict indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': data['dim2'].values[1:4]}) - self.assertDataArrayIdentical(data['dim2'][1:4], indexing_da['dim2']) + actual = data.isel(dim2=indexing_da) + self.assertDataArrayIdentical(actual['dim2'], indexing_da['dim2']) + + # Silently drop conflicted (non-dimensional) coordinate of indexer + indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], + coords={'dim2': data['dim2'].values[1:4], + 'numbers': ('dim2', np.arange(2, 5))}) + actual = data.isel(dim2=indexing_da) + self.assertDataArrayIdentical(actual['numbers'], data['numbers']) # boolean data array with coordinate with the same name indexing_da = DataArray(np.arange(1, 10), dims=['dim2'], @@ -1034,20 +1044,12 @@ def test_isel_dataarray(self): # non-dimension coordinate will be also attached indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], - coords={'dim2': np.random.randn(3), - 'non_dim': (('dim2', ), + coords={'non_dim': (('dim2', ), np.random.randn(3))}) actual = data.isel(dim2=indexing_da) assert 'non_dim' in actual assert 'non_dim' in actual.coords - # indexing with DataArray with drop=True - indexing_da = DataArray(np.arange(1, 4), dims=['a'], - coords={'a': np.random.randn(3)}) - actual = data.isel(dim1=indexing_da) - assert 'a' in actual - assert 'dim1' not in actual - # Index by a scalar DataArray indexing_da = DataArray(3, dims=[], coords={'station': 2}) actual = data.isel(dim2=indexing_da) @@ -1124,6 +1126,23 @@ def test_sel_dataarray(self): self.assertDataArrayEqual(actual['new_dim'].drop('dim2'), ind['new_dim']) + # with conflicted coordinate (silently ignored) + ind = DataArray([0.0, 0.5, 1.0], dims=['dim2'], + coords={'dim2': ['a', 'b', 'c']}) + actual = data.sel(dim2=ind) + 
expected = data.isel(dim2=[0, 1, 2]) + self.assertDatasetEqual(actual, expected) + + # with non-dimensional coordinate + ind = DataArray([0.0, 0.5, 1.0], dims=['dim2'], + coords={'dim2': ['a', 'b', 'c'], + 'numbers': ('dim2', [0, 1, 2]), + 'new_dim': ('dim2', [1.1, 1.2, 1.3])}) + actual = data.sel(dim2=ind) + expected = data.isel(dim2=[0, 1, 2]) + self.assertDatasetEqual(actual.drop('new_dim'), expected) + assert np.allclose(actual['new_dim'].values, ind['new_dim'].values) + def test_sel_drop(self): data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) expected = Dataset({'foo': 1}) From fbbe35cf2fb225fd78b398702b344b2e5f75576a Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 18 Sep 2017 13:49:14 +0900 Subject: [PATCH 104/113] Python 2 support --- xarray/core/dataset.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 998b625b4f3..bd7beb06db1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1186,7 +1186,12 @@ def _get_indexers_coordinates(self, indexers): not coords[k].equals(self._variables[k])): raise IndexError('Dimension coordinate {0:s} conflicts between' ' indexed and indexing objects.'.format(k)) - return coords + + attached_coords = OrderedDict() + for k, v in coords.items(): # silently drop the conflicted variables. + if k not in self._variables: + attached_coords[k] = v + return attached_coords def _drop_nonpriority_coords(self, indexers, mode='sel'): """ @@ -1276,12 +1281,12 @@ def isel(self, drop=False, **indexers): coord_names = set(variables) & set(self._coord_names) selected = self._replace_vars_and_dims( variables, coord_names=coord_names) - # Dataset consisting of only coordinate. 
+ + # Extract coordinates from indexers coord_vars = selected._get_indexers_coordinates(indexers) - coords = self._replace_vars_and_dims( - coord_vars, coord_names=list(coord_vars.keys())) - # Drop conflicted variable from indexers coordinate - return coords.update(selected) + variables.update(coord_vars) + coord_names = set(variables) & set(self._coord_names) | set(coord_vars) + return self._replace_vars_and_dims(variables, coord_names=coord_names) def sel(self, method=None, tolerance=None, drop=False, **indexers): """Returns a new dataset with each array indexed by tick labels From db23c938d295cad3cd3c3666be4504d2cdc01260 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 18 Sep 2017 14:45:27 +0900 Subject: [PATCH 105/113] Add tests for mindex selection. --- xarray/core/indexing.py | 5 +---- xarray/tests/test_dataset.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 57bfdbc5b13..41dab8b08a8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -95,10 +95,7 @@ def get_loc(index, label, method=None, tolerance=None): def get_indexer_nd(index, labels, method=None, tolerance=None): - """ Call pd.Index.get_indexer(labels). If labels are Variable, - The return type is also a Variable with the same dimension to - labels. - """ + """ Call pd.Index.get_indexer(labels). 
""" kwargs = _index_method_kwargs(method, tolerance) flat_labels = np.ravel(labels) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2c396c0d0eb..1252a1bb482 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1143,6 +1143,26 @@ def test_sel_dataarray(self): self.assertDatasetEqual(actual.drop('new_dim'), expected) assert np.allclose(actual['new_dim'].values, ind['new_dim'].values) + def test_sel_dataarray_mindex(self): + midx = pd.MultiIndex.from_product([list('abc'), [0, 1]], + names=('one', 'two')) + mds = xr.Dataset({'var': (('x', 'y'), np.random.rand(6, 3))}, + coords={'x': midx, 'y': range(3)}) + actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims='z')) + actual_sel = mds.sel(x=Variable('z', mds.indexes['x'][:3])) + assert actual_isel['x'].dims == ('z', ) + assert actual_sel['x'].dims == ('z', ) + self.assertDatasetIdentical(actual_isel, actual_sel) + + # with coordinate + actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims='z', + coords={'z': [0, 1, 2]})) + actual_sel = mds.sel(x=xr.DataArray(mds.indexes['x'][:3], dims='z', + coords={'z': [0, 1, 2]})) + assert actual_isel['x'].dims == ('z', ) + assert actual_sel['x'].dims == ('z', ) + self.assertDatasetIdentical(actual_isel, actual_sel) + def test_sel_drop(self): data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) expected = Dataset({'foo': 1}) From 031be9a5be9537d4c5369ad565d484bfd3df0151 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 18 Sep 2017 18:47:24 +0900 Subject: [PATCH 106/113] Drop coordinate of itself. --- xarray/core/dataset.py | 28 ++++++++++++++++++---------- xarray/tests/test_dataset.py | 6 ++++++ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bd7beb06db1..546f121502d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1205,8 +1205,11 @@ def _drop_nonpriority_coords(self, indexers, mode='sel'): 1. 
If object is constructed from coordinate, the same name coordinates of the indexer will be dropped. + 2. If an indexer is a DataArray with a coordinate of itself, + this coordinate will be dropped. + Rules for `sel` mode - 2. Indexed coordinates from the indexed object take precedence. + 3. Indexed coordinates from the indexed object take precedence. """ from .dataarray import DataArray, _ThisArray @@ -1221,15 +1224,20 @@ def drop_coord(v, k): new_indexers = OrderedDict() for k, v in indexers.items(): - # rule 1 - if (this_arr is not None and k in self._variables and - this_arr.equals(self._variables[k])): - new_indexers[k] = drop_coord(v, k) - # rule 2 - elif mode == 'sel' and k in self._coord_names: - new_indexers[k] = drop_coord(v, k) - else: - new_indexers[k] = v + # only consider DataArray + if isinstance(v, DataArray): + # rule 1 + if (this_arr is not None and k in self._variables and + this_arr.equals(self._variables[k])): + v = drop_coord(v, k) + # rule 2 + if (k in getattr(v, 'coords', {}) and + v.variable.equals(v.coords[k])): + v = drop_coord(v, k) + # rule 3 + if mode == 'sel' and k in self._coord_names: + v = drop_coord(v, k) + new_indexers[k] = v return new_indexers def isel(self, drop=False, **indexers): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 1252a1bb482..224feaf84f9 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1057,6 +1057,12 @@ def test_isel_dataarray(self): actual = data.isel(dim2=indexing_da['station']) assert 'station' in actual + # indexer generated from coordinates + indexing_ds = Dataset({}, coords={'dim2': [0, 1, 2]}) + actual = data.isel(dim2=indexing_ds['dim2']) + expected = data.isel(dim2=[0, 1, 2]) + self.assertDatasetIdentical(actual, expected) + def test_sel(self): data = create_test_data() int_slicers = {'dim1': slice(None, None, 2), From 969f9cf759460dbcb1a90ae84c6c2610b32bb714 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 18 Sep 2017 20:28:48 +0900 
Subject: [PATCH 107/113] Clean up the coordinate dropping logic. --- xarray/core/dataset.py | 10 +++++----- xarray/tests/test_dataset.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 546f121502d..ae79bafcb54 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1220,7 +1220,7 @@ def _drop_nonpriority_coords(self, indexers, mode='sel'): this_arr = v def drop_coord(v, k): - return v.drop(k) if (isinstance(v, DataArray) and k in v) else v + return v.drop(k) if k in v.coords else v new_indexers = OrderedDict() for k, v in indexers.items(): @@ -1228,12 +1228,12 @@ def drop_coord(v, k): if isinstance(v, DataArray): # rule 1 if (this_arr is not None and k in self._variables and - this_arr.equals(self._variables[k])): + this_arr is self._variables[k]): v = drop_coord(v, k) # rule 2 - if (k in getattr(v, 'coords', {}) and - v.variable.equals(v.coords[k])): - v = drop_coord(v, k) + for ck, cv in v.coords.items(): + if v.variable is cv.variable: + v = drop_coord(v, ck) # rule 3 if mode == 'sel' and k in self._coord_names: v = drop_coord(v, k) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 224feaf84f9..28eab924ec3 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1055,7 +1055,7 @@ def test_isel_dataarray(self): actual = data.isel(dim2=indexing_da) assert 'station' in actual actual = data.isel(dim2=indexing_da['station']) - assert 'station' in actual + assert 'station' not in actual # indexer generated from coordinates indexing_ds = Dataset({}, coords={'dim2': [0, 1, 2]}) From b4e5b3624e3c0cb6048b39e8e28359a6d8495765 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 23 Sep 2017 00:17:16 +0900 Subject: [PATCH 108/113] A small bug fix in coordinate dropping logic --- xarray/core/dataset.py | 7 +++++-- xarray/tests/test_dataset.py | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git 
a/xarray/core/dataset.py b/xarray/core/dataset.py index ae79bafcb54..200b81f4b77 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1235,8 +1235,11 @@ def drop_coord(v, k): if v.variable is cv.variable: v = drop_coord(v, ck) # rule 3 - if mode == 'sel' and k in self._coord_names: - v = drop_coord(v, k) + if mode == 'sel': + coord_names = v._coords.keys() + for cname in coord_names: + if cname in self._coord_names: + v = drop_coord(v, cname) new_indexers[k] = v return new_indexers diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 28eab924ec3..b9590219ee8 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1139,6 +1139,19 @@ def test_sel_dataarray(self): expected = data.isel(dim2=[0, 1, 2]) self.assertDatasetEqual(actual, expected) + # with conflicted coordinate (silently ignored) + ind = DataArray([0.0, 0.5, 1.0], dims=['new_dim'], + coords={'new_dim': ['a', 'b', 'c'], + 'dim2': 3}) + actual = data.sel(dim2=ind) + self.assertDataArrayEqual(actual['new_dim'].drop('dim2'), + ind['new_dim'].drop('dim2')) + expected = data.isel(dim2=[0, 1, 2]) + expected['dim2'] = (('new_dim'), expected['dim2'].values) + self.assertDataArrayEqual(actual['dim2'].drop('new_dim'), + expected['dim2']) + assert actual['var1'].dims == ('dim1', 'new_dim') + # with non-dimensional coordinate ind = DataArray([0.0, 0.5, 1.0], dims=['dim2'], coords={'dim2': ['a', 'b', 'c'], @@ -1154,6 +1167,13 @@ def test_sel_dataarray_mindex(self): names=('one', 'two')) mds = xr.Dataset({'var': (('x', 'y'), np.random.rand(6, 3))}, coords={'x': midx, 'y': range(3)}) + + actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims='x')) + actual_sel = mds.sel(x=DataArray(mds.indexes['x'][:3], dims='x')) + assert actual_isel['x'].dims == ('x', ) + assert actual_sel['x'].dims == ('x', ) + self.assertDatasetIdentical(actual_isel, actual_sel) + actual_isel = mds.isel(x=xr.DataArray(np.arange(3), dims='z')) actual_sel = mds.sel(x=Variable('z', 
mds.indexes['x'][:3])) assert actual_isel['x'].dims == ('z', ) From dc60348b3f435eab54cd24ca2d91e973311ece2c Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 27 Sep 2017 19:50:11 +0900 Subject: [PATCH 109/113] Fixes based on jhamman's comments. --- doc/indexing.rst | 12 ++++++------ doc/whats-new.rst | 7 +++++-- xarray/core/dataset.py | 15 ++++++++------- xarray/core/indexing.py | 2 +- xarray/core/variable.py | 4 ++-- xarray/tests/test_dataset.py | 9 ++++----- 6 files changed, 26 insertions(+), 23 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 0b78b3384dc..4c74976cbb0 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -318,7 +318,7 @@ indexers' dimension: ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) da[ind] -In briefly, similar to how NumPy's `advanced indexing`_ works, vectorized +Similar to how NumPy's `advanced indexing`_ works, vectorized indexing for xarray is based on our :ref:`broadcasting rules `. See :ref:`indexing.rules` for the complete specification. @@ -330,12 +330,12 @@ Vectorized indexing also works with ``isel``, ``loc``, and ``sel``: .. ipython:: python ind = xr.DataArray([[0, 1], [0, 1]], dims=['a', 'b']) - da.isel(y=ind) # same to da[:, ind] + da.isel(y=ind) # same as da[:, ind] ind = xr.DataArray([['a', 'b'], ['b', 'a']], dims=['a', 'b']) - da.loc[:, ind] # same to da.sel(y=ind) + da.loc[:, ind] # same as da.sel(y=ind) -and also for ``Dataset`` +These methods may also be applied to ``Dataset`` objects .. ipython:: python @@ -380,7 +380,7 @@ Vectorized indexing can be used to assign values to xarray object. da[ind_x, ind_y] += 100 # increment is also possible da -As like numpy ndarray, value assignment sometimes works differently from what one may expect. +Like ``numpy.ndarray``, value assignment sometimes works differently from what one may expect. .. ipython:: python @@ -398,7 +398,7 @@ __ https://docs.scipy.org/doc/numpy/user/basics.indexing.html#assigning-values-t .. 
note:: - Dask backend does not yet support value assignment + Dask array does not support value assignment (see :ref:`dask` for the details). diff --git a/doc/whats-new.rst b/doc/whats-new.rst index fb45f792746..eac7d92e46c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,10 +27,13 @@ Backward Incompatible Changes indexing, as well as vectorized indexing. Due to this change, existing uses of xarray objects to index other xarray objects will break in some cases. + ``isel_points`` / ``sel_points`` methods are deprecated, since the same thing + can be done by the new ``isel`` / ``sel`` methods. See :ref:`vectorized_indexing` for the details (:issue:`1444`, :issue:`1436`, ). By `Keisuke Fujii `_ and `Stephan Hoyer `_. + Breaking changes ~~~~~~~~~~~~~~~~ @@ -67,9 +70,9 @@ Breaking changes [...] Note that both versions are currently supported, but using the old syntax will - produce a warning encouraging users to adopt the new syntax. + produce a warning encouraging users to adopt the new syntax. By `Daniel Rothenberg `_. - + - ``repr`` and the Jupyter Notebook won't automatically compute dask variables. Datasets loaded with ``open_dataset`` won't automatically read coords from disk when calling ``repr`` (:issue:`1522`). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 97d954c135b..a14fd177114 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1158,7 +1158,7 @@ def _get_indexers_coordinates(self, indexers): Returns an OrderedDict mapping from coordinate name to the coordinate variable. - Only coordinate with a name different from any of sef.variables will + Only coordinate with a name different from any of self.variables will be attached. """ from .dataarray import DataArray @@ -1261,7 +1261,7 @@ def isel(self, drop=False, **indexers): **indexers : {dim: indexer, ...} Keyword arguments with names matching dimensions and values given by integers, slice objects or arrays. 
- indexer can be a integer, slice, array-like or even DataArray. + indexer can be a integer, slice, array-like or DataArray. If DataArrays are passed as indexers, xarray-style indexing will be carried out. See :ref:`indexing` for the details. @@ -1291,14 +1291,15 @@ def isel(self, drop=False, **indexers): if not (drop and name in var_indexers): variables[name] = new_var - coord_names = set(variables) & set(self._coord_names) - selected = self._replace_vars_and_dims( - variables, coord_names=coord_names) + coord_names = set(variables).intersection(self._coord_names) + selected = self._replace_vars_and_dims(variables, + coord_names=coord_names) # Extract coordinates from indexers coord_vars = selected._get_indexers_coordinates(indexers) variables.update(coord_vars) - coord_names = set(variables) & set(self._coord_names) | set(coord_vars) + coord_names = set(variables).intersection(self._coord_names).union( + coord_vars) return self._replace_vars_and_dims(variables, coord_names=coord_names) def sel(self, method=None, tolerance=None, drop=False, **indexers): @@ -1373,7 +1374,7 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): for k, v in indexers.items(): if isinstance(v, Variable): pos_indexers[k] = Variable(v.dims, pos_indexers[k]) - if isinstance(v, DataArray): + elif isinstance(v, DataArray): pos_indexers[k] = DataArray(pos_indexers[k], coords=v.coords, dims=v.dims) result = self.isel(drop=drop, **pos_indexers) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 41dab8b08a8..8758f6890cd 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -287,7 +287,7 @@ class BasicIndexer(IndexerTuple): class OuterIndexer(IndexerTuple): """ Tuple for outer/orthogonal indexing. - All the item is one of integer, slice, and 1d-np.ndarray. + All the items are one of integer, slice, and 1d-np.ndarray. 
""" diff --git a/xarray/core/variable.py b/xarray/core/variable.py index dde7650626d..0b19ebe5599 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -114,8 +114,8 @@ def as_variable(obj, name=None): raise MissingDimensionsError( '%r has more than 1-dimension and the same name as one of its ' 'dimensions %r. xarray disallows such variables because they ' - 'conflict with the coordinates used to label dimensions.' - % (name, obj.dims)) + 'conflict with the coordinates used to label ' + 'dimensions.' % (name, obj.dims)) obj = obj.to_index_variable() return obj diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index f2d0de3211b..887063007f2 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1391,9 +1391,8 @@ def test_sel_fancy(self): self.assertDataArrayIdentical(actual['a'].drop('x'), idx_x['a']) self.assertDataArrayIdentical(actual['b'].drop('y'), idx_y['b']) - if pd.__version__ >= '0.17': - with self.assertRaises(KeyError): - data.sel_points(x=[2.5], y=[2.0], method='pad', tolerance=1e-3) + with self.assertRaises(KeyError): + data.sel_points(x=[2.5], y=[2.0], method='pad', tolerance=1e-3) def test_sel_method(self): data = create_test_data() @@ -1564,9 +1563,9 @@ def test_reindex_warning(self): assert any(["Indexer has dimensions " in str(w.message) for w in ws]) + # Should not warn + ind = xr.DataArray([0.0, 1.0], dims=['dim2'], name='ind') with pytest.warns(FutureWarning) as ws: - # Should not warn - ind = xr.DataArray([0.0, 1.0], dims=['dim2'], name='ind') data.reindex(dim2=ind) assert all(["Indexer has dimensions " not in str(w.message) for w in ws]) From 8a62ad96bf847ecc077a65691a9159dff511612d Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 28 Sep 2017 08:45:04 +0900 Subject: [PATCH 110/113] Improve test for warning. 
--- xarray/tests/test_dataset.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 887063007f2..86727c9d1e1 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1565,11 +1565,9 @@ def test_reindex_warning(self): # Should not warn ind = xr.DataArray([0.0, 1.0], dims=['dim2'], name='ind') - with pytest.warns(FutureWarning) as ws: + with pytest.warns(None) as ws: data.reindex(dim2=ind) - assert all(["Indexer has dimensions " not in - str(w.message) for w in ws]) - warnings.warn('dummy', FutureWarning, stacklevel=3) + assert len(ws) == 0 def test_reindex_variables_copied(self): data = create_test_data() From cb84154392ce65d39defe23553b614d37fa83452 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 4 Oct 2017 10:08:23 +0900 Subject: [PATCH 111/113] Remove unused assert sentence. --- xarray/core/variable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0b19ebe5599..a3b5a4d0a88 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -597,7 +597,6 @@ def __getitem__(self, key): data = self._indexable_data[index_tuple] if new_order: data = np.moveaxis(data, range(len(new_order)), new_order) - assert getattr(data, 'ndim', 0) == len(dims), (data.ndim, len(dims)) return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) From 9726531a6c85c1f241ff0d966348adfd7f685e16 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 9 Oct 2017 16:28:15 -0400 Subject: [PATCH 112/113] Simplify rules for indexing conflicts --- xarray/core/dataset.py | 74 +++++++---------------------------- xarray/tests/test_dataset.py | 25 ++++++------ xarray/tests/test_indexing.py | 4 +- 3 files changed, 29 insertions(+), 74 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a14fd177114..5dc9477c9c6 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ 
-1170,8 +1170,9 @@ def _get_indexers_coordinates(self, indexers): if v.dtype.kind == 'b': if v.ndim != 1: # we only support 1-d boolean array raise ValueError( - '{0:d}d-boolean array is used for indexing. ' - 'Only 1d-array is supported.'.format(v.ndim)) + '{:d}d-boolean array is used for indexing along ' + 'dimension {!r}, but only 1d boolean arrays are ' + 'supported.'.format(v.ndim, k)) # Make sure in case of boolean DataArray, its # coordinate also should be indexed. v_coords = v[v.values.nonzero()[0]].coords @@ -1186,8 +1187,10 @@ def _get_indexers_coordinates(self, indexers): # make sure there are not conflict in dimension coordinates if (k in coords and k in self._variables and not coords[k].equals(self._variables[k])): - raise IndexError('Dimension coordinate {0:s} conflicts between' - ' indexed and indexing objects.'.format(k)) + raise IndexError( + 'dimension coordinate {!r} conflicts between ' + 'indexed and indexing objects:\n{}\nvs.\n{}' + .format(k, self._variables[k], coords[k])) attached_coords = OrderedDict() for k, v in coords.items(): # silently drop the conflicted variables. @@ -1195,56 +1198,6 @@ def _get_indexers_coordinates(self, indexers): attached_coords[k] = v return attached_coords - def _drop_nonpriority_coords(self, indexers, mode='sel'): - """ - Drop non-priority coords from indexers. - - indexers: mapping from dimension to indexers. - Mode: one of 'isel' | 'sel' - Returns: new indexer - - Common rule: - 1. If object is constructed from coordinate, the same name coordinates - of the indexer will be dropped. - - 2. If an indexer is a DataArray with a coordinate of itself, - this coordinate will be dropped. - - Rules for `sel` mode - 3. Indexed coordinates from the indexed object take precedence. 
- """ - from .dataarray import DataArray, _ThisArray - - # If Dataset is constructed from DataArray, skip consistency check - this_arr = None - for k, v in self._variables.items(): - if isinstance(k, _ThisArray): - this_arr = v - - def drop_coord(v, k): - return v.drop(k) if k in v.coords else v - - new_indexers = OrderedDict() - for k, v in indexers.items(): - # only consider DataArray - if isinstance(v, DataArray): - # rule 1 - if (this_arr is not None and k in self._variables and - this_arr is self._variables[k]): - v = drop_coord(v, k) - # rule 2 - for ck, cv in v.coords.items(): - if v.variable is cv.variable: - v = drop_coord(v, ck) - # rule 3 - if mode == 'sel': - coord_names = v._coords.keys() - for cname in coord_names: - if cname in self._coord_names: - v = drop_coord(v, cname) - new_indexers[k] = v - return new_indexers - def isel(self, drop=False, **indexers): """Returns a new dataset with each array indexed along the specified dimension(s). @@ -1281,7 +1234,6 @@ def isel(self, drop=False, **indexers): Dataset.sel DataArray.isel """ - indexers = self._drop_nonpriority_coords(indexers, mode='isel') indexers_list = self._validate_indexers(indexers) variables = OrderedDict() @@ -1298,8 +1250,9 @@ def isel(self, drop=False, **indexers): # Extract coordinates from indexers coord_vars = selected._get_indexers_coordinates(indexers) variables.update(coord_vars) - coord_names = set(variables).intersection(self._coord_names).union( - coord_vars) + coord_names = (set(variables) + .intersection(self._coord_names) + .union(coord_vars)) return self._replace_vars_and_dims(variables, coord_names=coord_names) def sel(self, method=None, tolerance=None, drop=False, **indexers): @@ -1363,7 +1316,6 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): """ from .dataarray import DataArray - indexers = self._drop_nonpriority_coords(indexers, mode='sel') v_indexers = {k: v.variable.data if isinstance(v, DataArray) else v for k, v in indexers.items()} @@ 
-1375,8 +1327,12 @@ def sel(self, method=None, tolerance=None, drop=False, **indexers): if isinstance(v, Variable): pos_indexers[k] = Variable(v.dims, pos_indexers[k]) elif isinstance(v, DataArray): + # drop coordinates found in indexers since .sel() already + # ensures alignments + coords = OrderedDict((k, v) for k, v in v._coords.items() + if k not in indexers) pos_indexers[k] = DataArray(pos_indexers[k], - coords=v.coords, dims=v.dims) + coords=coords, dims=v.dims) result = self.isel(drop=drop, **pos_indexers) return result._replace_indexes(new_indexes) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 86727c9d1e1..030cdc282bf 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd -import warnings import xarray as xr import pytest @@ -996,16 +995,16 @@ def test_isel_dataarray(self): # Conflict in the dimension coordinate indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], coords={'dim2': np.random.randn(3)}) - with self.assertRaisesRegexp(IndexError, "Dimension coordinate dim2"): + with self.assertRaisesRegexp( + IndexError, "dimension coordinate 'dim2'"): actual = data.isel(dim2=indexing_da) # Also the case for DataArray - with self.assertRaisesRegexp(IndexError, "Dimension coordinate dim2"): + with self.assertRaisesRegexp( + IndexError, "dimension coordinate 'dim2'"): actual = data['var2'].isel(dim2=indexing_da) - - # isel for the coordinate variable. 
Should not attach the coordinate - actual = data['dim2'].isel(dim2=indexing_da) - self.assertDataArrayIdentical(actual, - data['dim2'].isel(dim2=np.arange(1, 4))) + with self.assertRaisesRegexp( + IndexError, "dimension coordinate 'dim2'"): + data['dim2'].isel(dim2=indexing_da) # same name coordinate which does not conflict indexing_da = DataArray(np.arange(1, 4), dims=['dim2'], @@ -1055,13 +1054,13 @@ def test_isel_dataarray(self): actual = data.isel(dim2=indexing_da) assert 'station' in actual actual = data.isel(dim2=indexing_da['station']) - assert 'station' not in actual + assert 'station' in actual # indexer generated from coordinates indexing_ds = Dataset({}, coords={'dim2': [0, 1, 2]}) - actual = data.isel(dim2=indexing_ds['dim2']) - expected = data.isel(dim2=[0, 1, 2]) - self.assertDatasetIdentical(actual, expected) + with self.assertRaisesRegexp( + IndexError, "dimension coordinate 'dim2'"): + actual = data.isel(dim2=indexing_ds['dim2']) def test_sel(self): data = create_test_data() @@ -1567,7 +1566,7 @@ def test_reindex_warning(self): ind = xr.DataArray([0.0, 1.0], dims=['dim2'], name='ind') with pytest.warns(None) as ws: data.reindex(dim2=ind) - assert len(ws) == 0 + assert len(ws) == 0 def test_reindex_variables_copied(self): data = create_test_data() diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 7c8af81baf8..f8268ea2d6d 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -154,9 +154,9 @@ def test_lazily_indexed_array(self): for j in indexers: for k in indexers: if isinstance(j, np.ndarray) and j.dtype.kind == 'b': - j = np.arange(20) < 5 + j = np.arange(20) < 5 if isinstance(k, np.ndarray) and k.dtype.kind == 'b': - k = np.arange(30) < 5 + k = np.arange(30) < 5 expected = np.asarray(v[i, j, k]) for actual in [v_lazy[i, j, k], v_lazy[:, j, k][i], From 170abc515bfc7112c212032ab8cecd50804acdb6 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 10 Oct 2017 21:20:59 +0900 Subject: [PATCH 
113/113] Better error-message for multiindex vectorized-selection. --- xarray/core/indexing.py | 14 +++++++++++--- xarray/tests/test_dataset.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 8758f6890cd..956e5e2d0ec 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function from datetime import timedelta -from collections import defaultdict +from collections import defaultdict, Hashable import numpy as np import pandas as pd @@ -136,8 +136,13 @@ def convert_label_indexer(index, label, index_name='', method=None, elif len(label) == index.nlevels and not is_nested_vals: indexer = index.get_loc(tuple((label[k] for k in index.names))) else: - indexer, new_index = index.get_loc_level(tuple(label.values()), - level=tuple(label.keys())) + for k, v in label.items(): + # index should be an item (i.e. Hashable) not an array-like + if not isinstance(v, Hashable): + raise ValueError('Vectorized selection is not ' + 'available along level variable: ' + k) + indexer, new_index = index.get_loc_level( + tuple(label.values()), level=tuple(label.keys())) elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex): if _is_nested_tuple(label): @@ -160,6 +165,9 @@ def convert_label_indexer(index, label, index_name='', method=None, elif label.dtype.kind == 'b': indexer = label else: + if isinstance(index, pd.MultiIndex) and label.ndim > 1: + raise ValueError('Vectorized selection is not available along ' + 'MultiIndex variable: ' + index_name) indexer = get_indexer_nd(index, label, method, tolerance) if np.any(indexer < 0): raise KeyError('not all values found in index %r' diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 030cdc282bf..91cee61cf53 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1188,6 +1188,16 @@ def 
test_sel_dataarray_mindex(self): assert actual_sel['x'].dims == ('z', ) self.assertDatasetIdentical(actual_isel, actual_sel) + # Vectorized indexing with level-variables raises an error + with self.assertRaisesRegexp(ValueError, 'Vectorized selection is '): + mds.sel(one=['a', 'b']) + + with self.assertRaisesRegexp(ValueError, 'Vectorized selection is ' + 'not available along MultiIndex variable:' + ' x'): + mds.sel(x=xr.DataArray([np.array(midx[:2]), np.array(midx[-2:])], + dims=['a', 'b'])) + def test_sel_drop(self): data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) expected = Dataset({'foo': 1})