diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ae5499a46a7..8f9d46f5157 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -27,8 +27,7 @@ merge_data_and_coords) from .utils import (Frozen, SortedKeysDict, maybe_wrap_array, hashable, decode_numpy_dict_values, ensure_us_time_resolution) -from .variable import (Variable, as_variable, IndexVariable, - broadcast_variables) +from .variable import Variable, as_variable, IndexVariable, broadcast_variables from .pycompat import (iteritems, basestring, OrderedDict, integer_types, dask_array_type, range) from .options import OPTIONS @@ -576,21 +575,16 @@ def _replace_vars_and_dims(self, variables, coord_names=None, dims=None, return obj def _replace_indexes(self, indexes): + """ + Make some index_level to scalar_level. + indexes: mapping from dimension name to new index. + """ if not len(indexes): return self variables = self._variables.copy() for name, idx in indexes.items(): - variables[name] = IndexVariable(name, idx) - obj = self._replace_vars_and_dims(variables) - - # switch from dimension to level names, if necessary - dim_names = {} - for dim, idx in indexes.items(): - if not isinstance(idx, pd.MultiIndex) and idx.name != dim: - dim_names[dim] = idx.name - if dim_names: - obj = obj.rename(dim_names) - return obj + variables[name] = variables[name].reset_levels(idx.names) + return self._replace_vars_and_dims(variables) def copy(self, deep=False): """Returns a copy of this dataset. @@ -627,7 +621,7 @@ def _level_coords(self): for cname in self._coord_names: var = self.variables[cname] if var.ndim == 1: - level_names = var.to_index_variable().level_names + level_names = var.all_level_names if level_names is not None: dim, = var.dims level_coords.update({lname: dim for lname in level_names}) @@ -1127,10 +1121,7 @@ def isel(self, drop=False, **indexers): Dataset.isel_points DataArray.isel """ - invalid = [k for k in indexers if k not in self.dims] - if invalid: - raise ValueError("dimensions %r do not exist" % invalid) - + indexers = indexing.get_dim_pos_indexers(self, indexers) # all indexers should be int, slice or np.ndarrays indexers = [(k, (np.asarray(v) if not isinstance(v, integer_types + (slice,)) @@ -1607,6 +1598,9 @@ def expand_dims(self, dim, axis=None): If dim is already a scalar coordinate, it will be promoted to a 1D coordinate consisting of a single value. + If dim is a scalar-level of MultiIndex, this level is changed to + index-level. + Parameters ---------- dim : str or sequence of str. @@ -1629,6 +1623,12 @@ def expand_dims(self, dim, axis=None): if isinstance(dim, basestring): dim = [dim] + else: + dim = list(dim) + # scalars to converted to index-level + scalars = [d for d in dim if d in self._level_coords] + dim = [d for d in dim if d not in scalars] + if axis is not None and not isinstance(axis, (list, tuple)): axis = [axis] @@ -1653,7 +1653,8 @@ def expand_dims(self, dim, axis=None): variables = OrderedDict() for k, v in iteritems(self._variables): if k not in dim: - if k in self._coord_names: # Do not change coordinates + if k in self._coord_names: + # Do not change coordinates variables[k] = v else: result_ndim = len(v.dims) + len(axis) @@ -1682,6 +1683,13 @@ def expand_dims(self, dim, axis=None): # it will be promoted to a 1D coordinate with a single value. 
variables[k] = v.set_dims(k) + # Convert scalar-level of MultiIndex to index-level + for k, v in iteritems(self._variables): + if v.scalar_level_names is not None and len(scalars) > 0: + level_dims = [s for s in scalars if s in + v.scalar_level_names] + list(v.dims) + variables[k] = v.set_dims(level_dims) + return self._replace_vars_and_dims(variables, self._coord_names) def set_index(self, append=False, inplace=False, **indexes): @@ -1768,11 +1776,7 @@ def reorder_levels(self, inplace=False, **dim_order): replace_variables = {} for dim, order in dim_order.items(): coord = self._variables[dim] - index = coord.to_index() - if not isinstance(index, pd.MultiIndex): - raise ValueError("coordinate %r has no MultiIndex" % dim) - replace_variables[dim] = IndexVariable(coord.dims, - index.reorder_levels(order)) + replace_variables[dim] = coord.reorder_levels(dim, order) variables = self._variables.copy() variables.update(replace_variables) return self._replace_vars_and_dims(variables, inplace=inplace) @@ -1790,7 +1794,7 @@ def _stack_once(self, dims, new_dim): variables[name] = stacked_var else: variables[name] = var.copy(deep=False) - + # TODO move to IndexVariable method # consider dropping levels that are unused? levels = [self.get_index(dim) for dim in dims] if hasattr(pd, 'RangeIndex'): diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 0996ef91cd9..826435e1c24 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -214,18 +214,25 @@ def _summarize_var_or_coord(name, var, col_width, show_values=True, return front_str + values_str -def _summarize_coord_multiindex(coord, col_width, marker): - first_col = pretty_print(u' %s %s ' % (marker, coord.name), col_width) - return u'%s(%s) MultiIndex' % (first_col, unicode_type(coord.dims[0])) +def _summarize_coord_multiindex(coord, col_width, marker, name=None): + name = name or coord.name + first_col = pretty_print(u' %s %s ' % (marker, name), col_width) + if len(coord.dims) == 0: + return u'%sMultiIndex' % (first_col) + else: + return u'%s(%s) MultiIndex' % (first_col, unicode_type(coord.dims[0])) def _summarize_coord_levels(coord, col_width, marker=u'-'): - relevant_coord = coord[:30] + if len(coord.dims) == 0: + relevant_coord = coord # scalar MultiIndex + else: + relevant_coord = coord[:30] return u'\n'.join( [_summarize_var_or_coord(lname, relevant_coord.get_level_variable(lname), col_width, marker=marker) - for lname in coord.level_names]) + for lname in coord.all_level_names]) def _not_remote(var): @@ -247,11 +254,11 @@ def summarize_coord(name, var, col_width): is_index = name in var.dims show_values = is_index or _not_remote(var) marker = u'*' if is_index else u' ' - if is_index: - coord = var.variable.to_index_variable() + if name in var.coords: + coord = var.variable if coord.level_names is not None: return u'\n'.join( - [_summarize_coord_multiindex(coord, col_width, marker), + [_summarize_coord_multiindex(coord, col_width, marker, name), _summarize_coord_levels(coord, col_width)]) return _summarize_var_or_coord(name, var, col_width, show_values, marker) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 2ea9a225291..1210bdf8b7a 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -212,7 +212,6 @@ def convert_label_indexer(index, label, index_name='', method=None, indexer, new_index = index.get_loc_level( label, level=list(range(len(label))) ) - else: label = _asarray_tuplesafe(label) if label.ndim == 0: @@ -230,6 +229,34 @@ def convert_label_indexer(index, label, 
index_name='', method=None,
     return indexer, new_index
 
 
+def get_dim_pos_indexers(data_obj, indexers):
+    """
+    Given an xarray data object and position-based indexers, return a mapping
+    of position indexers with only dimension names as keys.
+    Indexing by a MultiIndex level name is valid only for the single-level case.
+    """
+    invalid = [k for k in indexers
+               if k not in data_obj.dims and k not in data_obj._level_coords]
+    if invalid:
+        raise ValueError("dimensions or multi-index levels %r do not exist"
+                         % invalid)
+
+    dim_indexers = {}
+    for key, label in iteritems(indexers):
+        dim, = data_obj[key].dims
+        if key != dim:
+            # assume a multi-index level indexer here
+            if len(data_obj.variables[dim].level_names) == 1:
+                # valid only for the single-level case
+                dim_indexers[dim] = label
+            else:
+                raise ValueError('positional indexer for multi-level'
+                                 ' MultiIndex is not supported.')
+        else:
+            dim_indexers[key] = label
+    return dim_indexers
+
+
 def get_dim_indexers(data_obj, indexers):
     """Given a xarray data object and label based indexers, return a mapping
     of label indexers with only dimension names as keys.
@@ -498,7 +525,9 @@ class PandasIndexAdapter(utils.NDArrayMixin):
     def __init__(self, array, dtype=None):
         self.array = utils.safe_cast_to_index(array)
         if dtype is None:
-            if isinstance(array, pd.PeriodIndex):
+            if array is None:
+                self._dtype = None
+            elif isinstance(array, pd.PeriodIndex):
                 dtype = np.dtype('O')
             elif hasattr(array, 'categories'):
                 # category isn't a real numpy dtype
@@ -557,6 +586,195 @@ def __getitem__(self, key):
 
         return result
 
+    def take(self, indices):
+        return type(self)(self.array.take(indices), dtype=self.dtype)
+
+    @classmethod
+    def concatenate(cls, arrays):
+        """ Concatenate indexes. """
+        return arrays[0].array.append([idx.array for idx in arrays[1:]])
+
+    def __eq__(self, other):
+        return self.array.equals(other.array)
+
     def __repr__(self):
         return ('%s(array=%r, dtype=%r)'
                 % (type(self).__name__, self.array, self.dtype))
+
+
+class PandasMultiIndexAdapter(PandasIndexAdapter):
+    """
+    An extension of PandasIndexAdapter for MultiIndex, which keeps
+    self._scalars indicating the level names that should be treated as
+    scalars.
+    """
+    def __init__(self, array, dtype=None, scalars=[]):
+        super(PandasMultiIndexAdapter, self).__init__(array, dtype)
+        # If array is 0-dimensional, the scalars argument is required because
+        # a tuple does not carry level names.
+        if array.ndim == 0 and len(scalars) == 0:
+            raise ValueError('Level names are required for 0d-array input.')
+
+        if isinstance(self.array, pd.MultiIndex):
+            for s in scalars:
+                if s not in self.all_levels:
+                    raise ValueError('scalar %s is not a valid level name.'
+                                     % s)
+        self._scalars = scalars
+
+    @property
+    def ndim(self):
+        return 1 if isinstance(self.array, pd.MultiIndex) else 0
+
+    @property
+    def shape(self):
+        if isinstance(self.array, pd.MultiIndex):
+            return (len(self.array),)
+        else:
+            return ()
+
+    @property
+    def all_levels(self):
+        """ All level names, including scalars. """
+        # scalar case
+        if not isinstance(self.array, pd.MultiIndex):
+            return self.scalars
+        return self.array.names
+
+    @property
+    def levels(self):
+        """ Level names, excluding scalars. """
+        level_names = list(self.all_levels)
+        for s in self.scalars:
+            level_names.remove(s)
+        return level_names
+
+    @property
+    def scalars(self):
+        return self._scalars
+
+    def __getitem__(self, key):
+        if isinstance(key, tuple) and len(key) == 1:
+            # unpack key so it can index a pandas.Index object (pandas.Index
+            # objects don't like tuples)
+            key, = key
+
+        result = self.array[key]
+        if isinstance(result, tuple):  # if a single item is chosen
+            result = utils.to_0d_object_array(result)
+            return PandasMultiIndexAdapter(result, dtype=self.dtype,
+                                           scalars=self.all_levels)
+        return PandasMultiIndexAdapter(result, dtype=self.dtype,
+                                       scalars=self.scalars)
+
+    def get_level_values(self, level):
+        """
+        Return the values of the given level. For a scalar level, return the
+        single item.
+        """
+        if level in self.scalars:
+            if not isinstance(self.array, pd.MultiIndex):
+                return self.array.item()[self.scalars.index(level)]
+            else:
+                return self.array.get_level_values(level)[0]
+        elif level in self.levels:
+            return self.array.get_level_values(level)
+        else:
+            raise ValueError('level %r does not exist.' % level)
+
+    def set_scalar(self, scalars):
+        invalid = [s for s in scalars if s not in self.levels]
+        if invalid:
+            raise ValueError('scalars %s are not valid level names.' % invalid)
+        # keep scalars in the original level order, preserving existing ones
+        new_scalars = []
+        for l in self.all_levels:
+            if l in self._scalars:
+                new_scalars.append(l)
+            elif l in scalars:
+                new_scalars.append(l)
+        # if all the levels become scalar, reduce to size 1
+        if set(new_scalars) == set(self.all_levels):
+            type(self)(np.array(self.array[0]), self.dtype, new_scalars)
+        return type(self)(self.array, self.dtype, new_scalars)
+
+    def reset_scalar(self, scalars):
+        if len(scalars) == 0:
+            return self
+
+        level_names = self.all_levels
+        if not isinstance(self.array, pd.MultiIndex):
+            # in the 0d-case, make a MultiIndex from the stored tuple
+            invalid = [s for s in scalars if s not in self.scalars]
+            if invalid:
+                raise ValueError('scalars %s are not valid level names.'
+                                 % invalid)
+            array = pd.MultiIndex.from_tuples([self.array.item()],
+                                              names=self.scalars)
+        else:
+            invalid = [s for s in scalars if s not in level_names]
+            if invalid:
+                raise ValueError('scalars %s are not valid level names.'
+                                 % invalid)
+            else:
+                array = self.array
+
+        new_scalars = list(self.scalars)
+        for s in scalars:
+            new_scalars.remove(s)
+        return type(self)(array, self.dtype, new_scalars)
+
+    def to_1dIndex(self):
+        """ Convert to a size-1 index if self.array is a scalar (0d-array).
+        """
+        if not isinstance(self.array, pd.MultiIndex):
+            return type(self)(pd.MultiIndex.from_tuples([self.array.item()],
+                                                         names=self.scalars),
+                              scalars=self.scalars)
+        else:
+            return self
+
+    def __eq__(self, other):
+        return (self.array.equals(other.array) and
+                self.scalars == other.scalars)
+
+    def __repr__(self):
+        return ('%s(array=%r, dtype=%r, scalars=%r)'
+                % (type(self).__name__, self.array, self.dtype, self.scalars))
+
+    def take(self, indices):
+        return type(self)(self.array.take(indices), dtype=self.dtype,
+                          scalars=self.scalars)
+
+    @classmethod
+    def concatenate(cls, arrays):
+        """
+        Concatenate multiple PandasMultiIndexAdapters.
+        arrays: a list of PandasMultiIndexAdapter
+        """
+        # make sure all the levels are the same.
+        first_arr = arrays[0]
+        if any([first_arr.all_levels != arr.all_levels for arr in arrays]):
+            raise ValueError('Levels are not identical.')
+
+        idx = first_arr.to_1dIndex().array
+        for arr in arrays[1:]:
+            idx = pd.MultiIndex.append(idx, arr.to_1dIndex().array)
+        idx = idx.set_names(first_arr.all_levels)
+        # automatically reset scalars
+        scalars = [s for s in first_arr.scalars if
+                   all([first_arr.get_level_values(s) ==
+                        arr.get_level_values(s) for arr in arrays])]
+        return cls(idx, scalars=scalars)
+
+    def reorder_levels(self, order):
+        """ Rearrange index levels using input order. Order should be a list
+        of valid level names. """
+        # TODO support renaming?
+        if set(order) != set(self.all_levels):
+            raise ValueError('Order must be a permutation of the existing '
+                             'level names.')
+
+        if not isinstance(self.array, pd.MultiIndex):  # scalar case
+            item = tuple([self.array.item()[self.scalars.index(o)]
+                          for o in order])
+            return type(self)(item, dtype=self.dtype, scalars=order)
+        else:
+            scalars = [o for o in order if o in self.scalars]
+            return type(self)(self.array.reorder_levels(order),
+                              dtype=self.dtype, scalars=scalars)
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 34b86275374..03a89d0e98f 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -18,7 +18,8 @@
 from . import utils
 from .pycompat import (basestring, OrderedDict, zip, integer_types,
                        dask_array_type)
-from .indexing import (PandasIndexAdapter, orthogonally_indexable)
+from .indexing import (PandasIndexAdapter, PandasMultiIndexAdapter,
+                       orthogonally_indexable)
 
 import xarray as xr  # only for Dataset and DataArray
 
@@ -95,7 +96,9 @@ def _maybe_wrap_data(data):
     NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should
     all pass through unmodified.
     """
-    if isinstance(data, pd.Index):
+    if isinstance(data, pd.MultiIndex):
+        return PandasMultiIndexAdapter(data)
+    elif isinstance(data, pd.Index):
         return PandasIndexAdapter(data)
     return data
 
@@ -263,6 +266,8 @@ def _in_memory(self):
     def data(self):
         if isinstance(self._data, dask_array_type):
             return self._data
+        elif isinstance(self._data, PandasMultiIndexAdapter):
+            return self._data
         else:
             return self.values
 
@@ -523,6 +528,7 @@ def chunk(self, chunks=None, name=None, lock=False):
 
     def isel(self, **indexers):
         """Return a new array indexed along the specified dimension(s).
+        This also supports indexing by MultiIndex level names.
 
         Parameters
         ----------
@@ -721,6 +727,11 @@ def set_dims(self, dims, shape=None):
 
         When possible, this operation does not copy this variable's data.
 
+        For a MultiIndex, this can also be used to change scalar levels back
+        to index levels.
+        In this case, dims should contain the dimension name (self.dims) as
+        well as the names of the scalar levels to be changed.
+ Parameters ---------- dims : str or sequence of str or dict @@ -751,6 +762,22 @@ def set_dims(self, dims, shape=None): # don't use broadcast_to unless necessary so the result remains # writeable if possible expanded_data = self.data + + elif isinstance(self._data, PandasMultiIndexAdapter): + # if a scalar MultiIndex case, it is expanded to size-1 1d-index. + # In this case, all the scalars are set to levels + # TODO API decision needed + if len(self.level_names) == 0: + expanded_data = self.data.to_1dIndex().reset_scalar( + self.scalar_level_names) + # Otherwise, scalars in dims are changed to levels + else: + expanded_dims = self.name + expanded_data = self.data.reset_scalar( + [d for d in dims if d in self.data.scalars]) + return Variable(self.dims[0], expanded_data, self._attrs, + self._encoding, fastpath=True) + elif shape is not None: dims_map = dict(zip(dims, shape)) tmp_shape = tuple(dims_map[d] for d in expanded_dims) @@ -974,7 +1001,11 @@ def concat(cls, variables, dim='concat_dim', positions=None, if dim in first_var.dims: axis = first_var.get_axis_num(dim) dims = first_var.dims - data = duck_array_ops.concatenate(arrays, axis=axis) + # TODO Need to remove duplicates at below and IndexVariable.concat + if isinstance(first_var._data, PandasMultiIndexAdapter): + data = PandasMultiIndexAdapter.concatenate(arrays) + else: + data = duck_array_ops.concatenate(arrays, axis=axis) if positions is not None: # TODO: deprecate this option -- we don't need it for groupby # any more. @@ -984,7 +1015,11 @@ def concat(cls, variables, dim='concat_dim', positions=None, else: axis = 0 dims = (dim,) + first_var.dims - data = duck_array_ops.stack(arrays, axis=axis) + # We need this to reconstruct the MultiIndex. + if isinstance(first_var._data, PandasMultiIndexAdapter): + data = PandasMultiIndexAdapter.concatenate(arrays) + else: + data = duck_array_ops.stack(arrays, axis=axis) attrs = OrderedDict(first_var.attrs) if not shortcut: @@ -1160,6 +1195,47 @@ def func(self, other): return self return func + @property + def all_level_names(self): + """Return MultiIndex level names including scalar levels. + Return None if this IndexVariable has no MultiIndex. + """ + if isinstance(self._data, PandasMultiIndexAdapter): + return self._data.all_levels + else: + return None + + @property + def scalar_level_names(self): + if isinstance(self._data, PandasMultiIndexAdapter): + return self._data.scalars + else: + return None + + @property + def level_names(self): + if isinstance(self._data, PandasMultiIndexAdapter): + return self._data.levels + else: + return None + + def get_level_variable(self, level): + """Return a new IndexVariable from a given MultiIndex level.""" + if self.all_level_names is None: + raise ValueError("IndexVariable %r has no MultiIndex" % self.name) + + level = self._data.get_level_values(level) + if isinstance(level, pd.Index): + return type(self)(self.dims, level) + else: # scalar case + return Variable((), level) + + def reorder_levels(self, dim, order): + if not isinstance(self._data, PandasMultiIndexAdapter): + raise ValueError("coordinate %r has no MultiIndex" % dim) + return Variable(self.dims, self._data.reorder_levels(order)) + + ops.inject_all_ops_and_reduce_methods(Variable) @@ -1173,17 +1249,30 @@ class IndexVariable(Variable): They also have a name property, which is the name of their sole dimension unless another name is given. 
""" - def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): super(IndexVariable, self).__init__(dims, data, attrs, encoding, fastpath) if self.ndim != 1: raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) - # Unlike in Variable, always eagerly load values into memory if not isinstance(self._data, PandasIndexAdapter): - self._data = PandasIndexAdapter(self._data) + if isinstance(self._data, pd.MultiIndex): + self._data = PandasMultiIndexAdapter(self._data) + else: + self._data = PandasIndexAdapter(self._data) + # automatic naming + if isinstance(self._data, PandasMultiIndexAdapter): + valid_level_names = [name or '{}_level_{}'.format(self.dims[0], i) + for i, name + in enumerate(self.all_level_names)] + self._data.array = self._data.array.set_names(valid_level_names) + else: + self._data.array = self._data.array.set_names(self.name) + + @property + def is_multiindex(self): + return isinstance(self._data, PandasMultiIndexAdapter) def load(self): # data is already loaded into memory for IndexVariable @@ -1229,12 +1318,12 @@ def concat(cls, variables, dim='concat_dim', positions=None, raise TypeError('IndexVariable.concat requires that all input ' 'variables be IndexVariable objects') - indexes = [v._data.array for v in variables] + indexes = [v._data for v in variables] if not indexes: data = [] else: - data = indexes[0].append(indexes[1:]) + data = indexes[0].concatenate(indexes) if positions is not None: indices = nputils.inverse_permutation( @@ -1274,7 +1363,8 @@ def equals(self, other, equiv=None): return False def _data_equals(self, other): - return self.to_index().equals(other.to_index()) + # note: other can be a Variable. + return self._data == other.to_index_variable()._data def to_index_variable(self): """Return this variable as an xarray.IndexVariable""" @@ -1287,34 +1377,7 @@ def to_index(self): # n.b. creating a new pandas.Index from an old pandas.Index is # basically free as pandas.Index objects are immutable assert self.ndim == 1 - index = self._data.array - if isinstance(index, pd.MultiIndex): - # set default names for multi-index unnamed levels so that - # we can safely rename dimension / coordinate later - valid_level_names = [name or '{}_level_{}'.format(self.dims[0], i) - for i, name in enumerate(index.names)] - index = index.set_names(valid_level_names) - else: - index = index.set_names(self.name) - return index - - @property - def level_names(self): - """Return MultiIndex level names or None if this IndexVariable has no - MultiIndex. 
- """ - index = self.to_index() - if isinstance(index, pd.MultiIndex): - return index.names - else: - return None - - def get_level_variable(self, level): - """Return a new IndexVariable from a given MultiIndex level.""" - if self.level_names is None: - raise ValueError("IndexVariable %r has no MultiIndex" % self.name) - index = self.to_index() - return type(self)(self.dims, index.get_level_values(level)) + return self._data.array @property def name(self): @@ -1324,6 +1387,17 @@ def name(self): def name(self, value): raise AttributeError('cannot modify name of IndexVariable in-place') + def reset_levels(self, level_names): + """ + level_names: a list of names to be kept as index_level + """ + if isinstance(self._data, PandasMultiIndexAdapter): + scalars = [name for name in self.all_level_names + if name not in level_names] + return type(self)(self.dims, self._data.set_scalar(scalars), + self._attrs, self._encoding, fastpath=True) + raise ValueError('Indexes conflict.') + # for backwards compatibility Coordinate = utils.alias(IndexVariable, 'Coordinate') @@ -1430,11 +1504,9 @@ def assert_unique_multiindex_level_names(variables): """ level_names = defaultdict(list) for var_name, var in variables.items(): - if isinstance(var._data, PandasIndexAdapter): - idx_level_names = var.to_index_variable().level_names - if idx_level_names is not None: - for n in idx_level_names: - level_names[n].append('%r (%s)' % (n, var_name)) + if isinstance(var._data, PandasMultiIndexAdapter): + for n in var.to_index_variable().to_index().names: + level_names[n].append('%r (%s)' % (n, var_name)) for k, v in level_names.items(): if k in variables: diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 92f616b8bd6..6e6e0e59dd5 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -9,6 +9,7 @@ from xarray import Variable, DataArray, Dataset import xarray.ufuncs as xu from xarray.core.pycompat import suppress +from xarray.core.indexing import PandasMultiIndexAdapter from . 
import TestCase, requires_dask from xarray.tests import unittest @@ -32,7 +33,8 @@ def assertLazyAnd(self, expected, actual, test): self.assertIsInstance(actual.data, da.Array) for k, v in actual.coords.items(): if k in actual.dims: - self.assertIsInstance(v.data, np.ndarray) + self.assertIsInstance(v.data, (np.ndarray, + PandasMultiIndexAdapter)) else: self.assertIsInstance(v.data, da.Array) elif isinstance(actual, Variable): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 608996003b6..68c9b499253 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -149,6 +149,21 @@ def test_repr_multiindex(self): print(actual) self.assertEqual(expected, actual) + # with scalar multiindex + data = data.sel(level_1='a') + expected = dedent("""\ + + Dimensions: (x: 2) + Coordinates: + * x (x) MultiIndex + - level_1 %s 'a' + - level_2 (x) int64 1 2 + Data variables: + *empty*""" % np.asarray('a').dtype) + actual = '\n'.join(x.rstrip() for x in repr(data).split('\n')) + print(actual) + self.assertEqual(expected, actual) + # verify that long level names are not truncated mindex = pd.MultiIndex.from_product( [['a', 'b'], [1, 2]], @@ -167,10 +182,33 @@ def test_repr_multiindex(self): print(actual) self.assertEqual(expected, actual) + def test_repr_scalar_multiindex(self): + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=('level_1', 'level_2')) + coords = OrderedDict() + coords['y'] = np.arange(4) + coords['z'] = mindex + data = Dataset({'x': (('y'), np.ones(4))}, coords=coords) + data = data.isel(z=0) # scalar multiindex + expected = dedent("""\ + + Dimensions: (y: 4) + Coordinates: + * y (y) {0} 0 1 2 3 + z MultiIndex + - level_1 {1} 'a' + - level_2 int64 1 + Data variables: + x (y) float64 1.0 1.0 1.0 1.0""".format( + np.asarray(1).dtype, np.asarray('a').dtype, np.asarray(1).dtype)) + actual = '\n'.join(x.rstrip() for x in repr(data).split('\n')) + print(actual) + self.assertEqual(expected, actual) + def test_repr_period_index(self): data = create_test_data(seed=456) - data.coords['time'] = pd.period_range('2000-01-01', periods=20, freq='B') - + data.coords['time'] = pd.period_range('2000-01-01', periods=20, + freq='B') # check that creating the repr doesn't raise an error #GH645 repr(data) @@ -1059,22 +1097,22 @@ def test_selection_multiindex(self): coords={'x': mindex}) def test_sel(lab_indexer, pos_indexer, replaced_idx=False, - renamed_dim=None): + scalared_dim=None): ds = mdata.sel(x=lab_indexer) expected_ds = mdata.isel(x=pos_indexer) if not replaced_idx: self.assertDatasetIdentical(ds, expected_ds) else: - if renamed_dim: - self.assertEqual(ds['var'].dims[0], renamed_dim) - ds = ds.rename({renamed_dim: 'x'}) + if scalared_dim: + self.assertTrue(scalared_dim in + ds['x'].variable.level_names) self.assertVariableIdentical(ds['var'].variable, expected_ds['var'].variable) self.assertVariableNotEqual(ds['x'], expected_ds['x']) test_sel(('a', 1, -1), 0) test_sel(('b', 2, -2), -1) - test_sel(('a', 1), [0, 1], replaced_idx=True, renamed_dim='three') + test_sel(('a', 1), [0, 1], replaced_idx=True, scalared_dim='three') test_sel(('a',), range(4), replaced_idx=True) test_sel('a', range(4), replaced_idx=True) test_sel([('a', 1, -1), ('b', 2, -2)], [0, 7]) @@ -1082,7 +1120,7 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, test_sel(slice(('a', 1), ('b', 1)), range(6)) test_sel({'one': 'a', 'two': 1, 'three': -1}, 0) test_sel({'one': 'a', 'two': 1}, [0, 1], replaced_idx=True, - renamed_dim='three') + scalared_dim='three') 
test_sel({'one': 'a'}, range(4), replaced_idx=True) self.assertDatasetIdentical(mdata.loc[{'x': {'one': 'a'}}], @@ -1097,6 +1135,63 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, self.assertDatasetIdentical(mdata.sel(x={'one': 'a', 'two': 1}), mdata.sel(one='a', two=1)) + def test_isel_multiindex(self): + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], + names=('one', 'two', 'three')) + mdata = Dataset(data_vars={'var': ('x', range(8))}, + coords={'x': mindex}) + with self.assertRaises(ValueError): + mdata.isel(one=0) + mdata2 = mdata.isel(x=[0, 1]).sel(one='a', two=1) + mdata3 = mdata.sel(one='a', two=1).isel(three=[0, 1]) + self.assertDatasetIdentical(mdata2, mdata3) + + mdata = xr.Dataset({'foo': (('x', 'y'), np.random.randn(3, 4))}, + {'x': ['a', 'b', 'c'], 'y': [1, 2, 3, 4]}) + mdata = mdata.stack(space=['x', 'y']) + mdata2 = mdata.isel(space=[0, 1]).sel(x='a') + mdata3 = mdata.sel(x='a').isel(y=[0, 1]) + self.assertDatasetIdentical(mdata2, mdata3) + # 1 single element is chosen. + self.assertTrue('space' in mdata.isel(space=0).coords) + + def test_concat_multiindex(self): + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], + names=('one', 'two', 'three')) + mdata = Dataset(data_vars={'var': ('x', range(8))}, + coords={'x': mindex}) + + actual = xr.concat([mdata.sel(one='a'), mdata.sel(one='b')], dim='x') + self.assertDatasetIdentical(actual, mdata) + # scalar multiindex. + actual = xr.concat([mdata.isel(x=i) for i in range(len(mdata['x']))], + dim='x') + self.assertDatasetIdentical(actual, mdata) + + def test_multiindex_expand_dims(self): + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], + names=('one', 'two', 'three')) + mdata = Dataset(data_vars={'var': ('x', range(8))}, + coords={'x': mindex}) + actual = mdata.isel(x=0).expand_dims('x') + expected = mdata.isel(x=[0]) + self.assertDatasetIdentical(actual, expected) + self.assertTrue(actual.variables['x'].level_names == + expected.variables['x'].level_names) + + # expand scalar-level + actual = mdata.sel(one='a').expand_dims('one') + expected = mdata.isel(x=[0, 1, 2, 3]) + self.assertDatasetIdentical(actual, expected) + self.assertTrue(actual.variables['x'].level_names == + expected.variables['x'].level_names) + + actual = mdata.sel(one='a', two=2).expand_dims(['one', 'two']) + expected = mdata.isel(x=[2, 3]) + self.assertDatasetIdentical(actual, expected) + self.assertTrue(actual.variables['x'].level_names == + expected.variables['x'].level_names) + def test_reindex_like(self): data = create_test_data() data['letters'] = ('dim3', 10 * ['a']) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 79e841e0f3b..83e11e76e97 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -6,6 +6,7 @@ from xarray import Dataset, DataArray, Variable from xarray.core import indexing +from xarray.core import variable from . 
import TestCase, ReturnItem @@ -244,3 +245,79 @@ def test_setitem(self): wrapped = indexing.MemoryCachedArray(original) wrapped[:] = 0 self.assertArrayEqual(original, np.zeros(10)) + + +class TestPandasMultiIndexAdapter(TestCase): + def test_multi(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = idx.set_names(['level_1', 'level_2']) + index = indexing.PandasMultiIndexAdapter(idx) + self.assertTrue(index.scalars == []) + + def test_0d(self): + array = variable.as_compatible_data(('a', 0)) + with self.assertRaises(ValueError): + index = indexing.PandasMultiIndexAdapter(array) + + index = indexing.PandasMultiIndexAdapter(array, + scalars=['l1', 'l2']) + self.assertTrue(index.scalars == ['l1', 'l2']) + # levels + self.assertTrue(index.levels == []) + + actual = index.reset_scalar(['l1']) + idx = pd.MultiIndex.from_tuples([('a', 0)]) + idx = idx.set_names(['l1', 'l2']) + expected = indexing.PandasMultiIndexAdapter(idx, scalars=['l2']) + self.assertArrayEqual(actual, expected) + + def test_getitem(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = idx.set_names(['level_1', 'level_2']) + index = indexing.PandasMultiIndexAdapter(idx) + self.assertTrue(all(index.get_level_values('level_1') == + idx.get_level_values('level_1'))) + + index = index.set_scalar(['level_2']) + # indexing should keep scalars + self.assertTrue(index[0].scalars == ['level_1', 'level_2']) + self.assertTrue(index[np.array([0, 1])].scalars == ['level_2']) + + def test_get_level_values(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = idx.set_names(['level_1', 'level_2']) + index = indexing.PandasMultiIndexAdapter(idx) + self.assertTrue(all(index.get_level_values('level_1') == + idx.get_level_values('level_1'))) + # set scalar + index = index.set_scalar(['level_2']) + self.assertTrue(index.get_level_values('level_2') == + idx.get_level_values('level_2')[0]) + self.assertTrue(all(index.get_level_values('level_1') == + idx.get_level_values('level_1'))) + # reset scalar + index = index.reset_scalar(['level_2']) + self.assertTrue(all(index.get_level_values('level_2') == + idx.get_level_values('level_2'))) + self.assertTrue(all(index.get_level_values('level_1') == + idx.get_level_values('level_1'))) + # set scalar and reduce to 1 element + index = index.set_scalar(['level_1', 'level_2']) + self.assertTrue(index.get_level_values('level_2') == + idx.get_level_values('level_2')[0]) + # restore the size 1 MultiIndex + index = index.reset_scalar(['level_1', 'level_2']) + self.assertTrue(index.get_level_values('level_2')[0] == + idx.get_level_values('level_2')[0]) + # works even if 1-item case. 
+ index = indexing.PandasMultiIndexAdapter(idx) + index1 = index[0] + self.assertTrue(index1.get_level_values('level_2') == + index.get_level_values('level_2')[0]) + + def test_eq(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = idx.set_names(['level_1', 'level_2']) + index = indexing.PandasMultiIndexAdapter(idx) + self.assertTrue(index == index) + self.assertTrue(index != index.set_scalar(['level_1'])) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index aa061516949..ff25d58f85a 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -12,10 +12,11 @@ import pytz import pandas as pd -from xarray import Variable, IndexVariable, Coordinate, Dataset -from xarray.core import indexing +from xarray import (Variable, IndexVariable, Coordinate, Dataset) +from xarray.core import indexing, utils from xarray.core.variable import as_variable, as_compatible_data -from xarray.core.indexing import PandasIndexAdapter, LazilyIndexedArray +from xarray.core.indexing import (PandasIndexAdapter, PandasMultiIndexAdapter, + LazilyIndexedArray) from xarray.core.pycompat import PY3, OrderedDict from xarray.core.common import full_like, zeros_like, ones_like @@ -410,15 +411,21 @@ def test_copy(self): source_ndarray(w.values)) self.assertVariableIdentical(v, copy(v)) + def test_to_index_variable(self): + v = self.cls('x', 0.5 * np.arange(10), {'foo': 'bar'}) + index = v.to_index_variable() + self.assertTrue(isinstance(index, self.cls)) + def test_copy_index(self): midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], names=('one', 'two', 'three')) - v = self.cls('x', midx) - for deep in [True, False]: - w = v.copy(deep=deep) - self.assertIsInstance(w._data, PandasIndexAdapter) - self.assertIsInstance(w.to_index(), pd.MultiIndex) - self.assertArrayEqual(v._data.array, w._data.array) + if self.cls is not IndexVariable: + v = self.cls('x', midx) + for deep in [True, False]: + w = v.copy(deep=deep) + self.assertIsInstance(w._data, PandasIndexAdapter) + self.assertIsInstance(w.to_index(), pd.MultiIndex) + self.assertArrayEqual(v._data.array, w._data.array) def test_real_and_imag(self): v = self.cls('x', np.arange(3) - 1j * np.arange(3), {'foo': 'bar'}) @@ -453,12 +460,6 @@ def test_pandas_datetime64_with_tz(self): # pandas is new enough that it has datetime64 with timezone dtype assert v.dtype == 'object' - def test_multiindex(self): - idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) - v = self.cls('x', idx) - self.assertVariableIdentical(Variable((), ('a', 0)), v[0]) - self.assertVariableIdentical(v, v[:]) - def test_load(self): array = self.cls('x', np.arange(5)) orig_data = array._data @@ -468,6 +469,52 @@ def test_load(self): assert type(copied._data) is type(orig_data) self.assertVariableIdentical(array, copied) + def test_multiindex(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = idx.set_names(['level_1', 'level_2']) + v = self.cls('x', idx) + idx_new = PandasMultiIndexAdapter(utils.to_0d_object_array(('a', 0)), + scalars=['level_1', 'level_2']) + self.assertVariableIdentical(Variable((), idx_new), v[0]) + # 1 element MultiIndex should have `scalar_level_names` + self.assertTrue(v[0].scalar_level_names == ['level_1', 'level_2']) + self.assertVariableIdentical(v, v[:]) + + def test_multiindex_concat(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = idx.set_names(['level_1', 'level_2']) + v = self.cls('x', idx) + concat = self.cls.concat([v[:3], v[3:]], dim='x') + 
self.assertTrue(isinstance(concat._data, PandasMultiIndexAdapter)) + self.assertVariableIdentical(v, concat) + # even works with 1 item + self.assertVariableIdentical(v, self.cls.concat([v], dim='x')) + + # test only for Variable since IndexVariable cannot store 0d-index. + if self.cls != IndexVariable: + # with scalar MultiIndex. + concat = v[0].concat([v[i] for i in range(len(v))], dim='x') + self.assertVariableIdentical(v, concat) + # make sure automatically reset scalars + concat = v[0].concat([v[i] for i in range(2)], dim='x') + self.assertTrue(concat.scalar_level_names == ['level_1']) + + def test_reorder_levels(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = idx.set_names(['level_1', 'level_2']) + v = self.cls('x', idx) + v2 = v.reorder_levels('x', ['level_1', 'level_2']) + self.assertVariableIdentical(v.to_index_variable(), + v2.to_index_variable()) + v2 = v.reorder_levels('x', ['level_2', 'level_1']) + self.assertTrue(v2.all_level_names == ['level_2', 'level_1']) + v = v.to_index_variable() + v2 = v2.to_index_variable() + self.assertVariableIdentical(v.get_level_variable('level_2'), + v2.get_level_variable('level_2')) + self.assertVariableIdentical(v.get_level_variable('level_1'), + v2.get_level_variable('level_1')) + class TestVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(Variable) @@ -637,6 +684,11 @@ def test_as_variable(self): expected = Variable([], 0) self.assertVariableIdentical(expected, actual) + def test_to_index_variable(self): + midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) + v = Variable(['x'], midx, {'foo': 'bar'}) + self.assertTrue(isinstance(v.to_index_variable(), IndexVariable)) + def test_repr(self): v = Variable(['time', 'x'], [[1, 2, 3], [4, 5, 6]], {'foo': 'bar'}) expected = dedent(""" @@ -1068,19 +1120,15 @@ class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) def test_init(self): - with self.assertRaisesRegexp(ValueError, 'must be 1-dimensional'): - IndexVariable((), 0) + with self.assertRaisesRegexp(ValueError, + 'must be 1-dimensional'): + IndexVariable(('a', 'b'), [[0, 1], [1, 2]]) def test_to_index(self): data = 0.5 * np.arange(10) v = IndexVariable(['time'], data, {'foo': 'bar'}) self.assertTrue(pd.Index(data, name='time').identical(v.to_index())) - def test_multiindex_default_level_names(self): - midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) - v = IndexVariable(['x'], midx, {'foo': 'bar'}) - self.assertEqual(v.to_index().names, ('x_level_0', 'x_level_1')) - def test_data(self): x = IndexVariable('x', np.arange(3.0)) self.assertIsInstance(x._data, PandasIndexAdapter) @@ -1098,27 +1146,10 @@ def test_name(self): with self.assertRaises(AttributeError): coord.name = 'y' - def test_level_names(self): - midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], - names=['level_1', 'level_2']) - x = IndexVariable('x', midx) - self.assertEqual(x.level_names, midx.names) - - self.assertIsNone(IndexVariable('y', [10.0]).level_names) - - def test_get_level_variable(self): - midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], - names=['level_1', 'level_2']) - x = IndexVariable('x', midx) - level_1 = IndexVariable('x', midx.get_level_values('level_1')) - self.assertVariableIdentical(x.get_level_variable('level_1'), level_1) - - with self.assertRaisesRegexp(ValueError, 'has no MultiIndex'): - IndexVariable('y', [10.0]).get_level_variable('level') - def test_concat_periods(self): periods = pd.period_range('2000-01-01', periods=10) - coords = 
[IndexVariable('t', periods[:5]), IndexVariable('t', periods[5:])] + coords = [IndexVariable('t', periods[:5]), + IndexVariable('t', periods[5:])] expected = IndexVariable('t', periods) actual = IndexVariable.concat(coords, dim='t') assert actual.identical(expected) @@ -1129,19 +1160,17 @@ def test_concat_periods(self): assert actual.identical(expected) assert isinstance(actual.to_index(), pd.PeriodIndex) - def test_concat_multiindex(self): - idx = pd.MultiIndex.from_product([[0, 1, 2], ['a', 'b']]) - coords = [IndexVariable('x', idx[:2]), IndexVariable('x', idx[2:])] - expected = IndexVariable('x', idx) - actual = IndexVariable.concat(coords, dim='x') - assert actual.identical(expected) - assert isinstance(actual.to_index(), pd.MultiIndex) - def test_coordinate_alias(self): with self.assertWarns('deprecated'): x = Coordinate('x', [1, 2, 3]) self.assertIsInstance(x, IndexVariable) + def test_equiv_multiindex(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx.set_names(['level_1', 'level_2']) + v = IndexVariable('x', idx) + v2 = v.reset_levels(['level_1']) + self.assertFalse(v.equals(v2)) class TestAsCompatibleData(TestCase):
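Usage sketch (not part of the patch): the snippet below illustrates the scalar-level behaviour exercised by the new tests (test_selection_multiindex, test_multiindex_expand_dims). It assumes an xarray build with this patch applied; the attributes scalar_level_names and level_names come from the variable.py changes above, and the commented values are the expected results, not verified output.

import pandas as pd
import xarray as xr

mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]],
                                    names=('one', 'two', 'three'))
mdata = xr.Dataset(data_vars={'var': ('x', list(range(8)))},
                   coords={'x': mindex})

# Selecting on a level no longer renames the 'x' dimension; 'one' is kept
# around as a scalar level of the MultiIndex.
sub = mdata.sel(one='a')
print(sub['x'].variable.scalar_level_names)   # expected: ['one']
print(sub['x'].variable.level_names)          # expected: ['two', 'three']

# expand_dims promotes the scalar level back to an index level.
restored = sub.expand_dims('one')
print(restored['x'].variable.level_names)     # expected: ['one', 'two', 'three']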