Skip to content

Commit

Permalink
Load IndexVariable.data into memory in init
Browse files Browse the repository at this point in the history
Make IndexVariables eagerly load their data into memory (from disk or dask) as soon as they're created.
  • Loading branch information
gimperiale committed Nov 13, 2016
1 parent 27b0916 commit 376200a
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 25 deletions.
2 changes: 1 addition & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Breaking changes
:py:meth:`values` property, won't automatically convert the array from dask
to numpy in the original object anymore.
If a dask object is used as a coord of a :py:class:`~xarray.DataArray` or
:py:class:`~xarray.Dataset`, its values will still be automatically cached,
:py:class:`~xarray.Dataset`, its values are eagerly computed and cached,
but only if it's used to index a dim (e.g. it's used for alignment).
By `Guido Imperiale <https://github.com/crusaderky>`_.

Expand Down
3 changes: 0 additions & 3 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,9 +887,6 @@ def selkeys(dict_, keys):
return dict((d, dict_[d]) for d in keys if d in dict_)

def maybe_chunk(name, var, chunks):
if name in self.dims:
return var

chunks = selkeys(chunks, var.dims)
if not chunks:
chunks = None
Expand Down
21 changes: 12 additions & 9 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1100,16 +1100,19 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
raise ValueError('%s objects must be 1-dimensional' %
type(self).__name__)

def _data_cached(self):
# Unlike in Variable._data_cached, always eagerly resolve dask arrays
self._data = self._data_cast()
return self._data
# Unlike in Variable, always eagerly load values into memory
if not isinstance(self._data, PandasIndexAdapter):
self._data = PandasIndexAdapter(self._data)

def _data_cast(self):
if isinstance(self._data, PandasIndexAdapter):
return self._data
else:
return PandasIndexAdapter(self._data)
@Variable.data.setter
def data(self, data):
Variable.data.fset(self, data)
if not isinstance(self._data, PandasIndexAdapter):
self._data = PandasIndexAdapter(self._data)

def chunk(self, chunks=None, name=None, lock=False):
# Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk()
return self.copy(deep=False)

def __getitem__(self, key):
key = self._item_key_to_tuple(key)
Expand Down
14 changes: 10 additions & 4 deletions xarray/test/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,10 @@ def assert_loads(vars=None):
vars = expected
with self.roundtrip(expected) as actual:
for k, v in actual.variables.items():
# IndexVariables are eagerly cached into memory
if k not in actual.dims:
# IndexVariables are eagerly loaded into memory
if k in actual.dims:
self.assertTrue(v._in_memory)
else:
self.assertFalse(v._in_memory)
yield actual
for k, v in actual.variables.items():
Expand Down Expand Up @@ -161,13 +163,17 @@ def test_dataset_compute(self):
# Test Dataset.compute()
for k, v in actual.variables.items():
# IndexVariables are eagerly cached
if k not in actual.dims:
if k in actual.dims:
self.assertTrue(v._in_memory)
else:
self.assertFalse(v._in_memory)

computed = actual.compute()

for k, v in actual.variables.items():
if k not in actual.dims:
if k in actual.dims:
self.assertTrue(v._in_memory)
else:
self.assertFalse(v._in_memory)
for v in computed.variables.values():
self.assertTrue(v._in_memory)
Expand Down
21 changes: 17 additions & 4 deletions xarray/test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,24 @@ def create_test_multiindex():


class InaccessibleVariableDataStore(backends.InMemoryDataStore):
def __init__(self, writer=None):
super(InaccessibleVariableDataStore, self).__init__(writer)
self._indexvars = set()

def store(self, variables, attributes, check_encoding_set=frozenset()):
super(InaccessibleVariableDataStore, self).store(
variables, attributes, check_encoding_set)
for k, v in variables.items():
if isinstance(v, IndexVariable):
self._indexvars.add(k)

def get_variables(self):
def lazy_inaccessible(x):
data = indexing.LazilyIndexedArray(InaccessibleArray(x.values))
return Variable(x.dims, data, x.attrs)
return dict((k, lazy_inaccessible(v)) for
def lazy_inaccessible(k, v):
if k in self._indexvars:
return v
data = indexing.LazilyIndexedArray(InaccessibleArray(v.values))
return Variable(v.dims, data, v.attrs)
return dict((k, lazy_inaccessible(k, v)) for
k, v in iteritems(self._variables))


Expand Down
6 changes: 2 additions & 4 deletions xarray/test/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1052,13 +1052,11 @@ def test_multiindex_default_level_names(self):

def test_data(self):
x = IndexVariable('x', np.arange(3.0))
# data should be initially saved as an ndarray
self.assertIs(type(x._data), np.ndarray)
self.assertIsInstance(x._data, PandasIndexAdapter)
self.assertIsInstance(x.data, np.ndarray)
self.assertEqual(float, x.dtype)
self.assertArrayEqual(np.arange(3), x)
self.assertEqual(float, x.values.dtype)
# after inspecting x.values, the IndexVariable value will be saved as an Index
self.assertIsInstance(x._data, PandasIndexAdapter)
with self.assertRaisesRegexp(TypeError, 'cannot be modified'):
x[:] = 0

Expand Down

0 comments on commit 376200a

Please sign in to comment.