
Commit cdc23ab

Review changes.

pp-mo committed Aug 20, 2019
1 parent f35bbd5 commit cdc23ab
Showing 3 changed files with 33 additions and 27 deletions.
47 changes: 21 additions & 26 deletions lib/iris/_lazy_data.py
@@ -23,16 +23,12 @@
from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa

try:  # Python 3
    from collections.abc import Iterable
except ImportError:  # Python 2
    from collections import Iterable
from functools import wraps

import dask
import dask.array as da
import dask.array.core
import dask.config
import dask.utils

import numpy as np
import numpy.ma as ma
@@ -101,27 +97,17 @@ def _optimum_chunksize(chunks, shape,
"chunks = [c[0] for c in normalise_chunks('auto', ...)]".
"""
# Return chunks unchanged, for types of invocation we don't comprehend.
if (any(elem <= 0 for elem in shape) or
not isinstance(chunks, Iterable) or
len(chunks) != len(shape)):
# Don't modify chunks for special values like -1, (0,), 'auto',
# or if shape contains 0 or -1 (like raw landsea-mask data proxies).
return chunks

# Set the chunksize limit.
if limit is None:
# Fetch the default 'optimal' chunksize from the dask config.
limit = dask.config.get('array.chunk-size')
# Convert to bytes
limit = da.core.parse_bytes(limit)
limit = dask.utils.parse_bytes(limit)

point_size_limit = limit / dtype.itemsize

# Create result chunks, starting with a copy of the input.
result = list(chunks)
if shape is None:
shape = result[:]

if np.prod(result) < point_size_limit:
# If size is less than maximum, expand the chunks, multiplying later
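
As an aside (not part of the commit), the expansion step this hunk leads into can be sketched roughly as follows. This is an illustrative re-implementation under stated assumptions, not the Iris code: it assumes the "multiply later (inner) dims first" strategy the comment above describes, and the function name and default limit are invented for the example.

import numpy as np

def sketch_optimum_chunksize(chunks, shape,
                             limit_bytes=8 * 1024 * 1024 * 2, itemsize=4):
    # Illustrative sketch only -- not the Iris implementation.
    # How many array points fit within the byte limit?
    point_size_limit = limit_bytes / itemsize
    result = list(chunks)
    if np.prod(result) < point_size_limit:
        # Chunk is smaller than the limit: grow it, expanding the later
        # (inner) dimensions first, but never beyond the full 'shape'.
        for i_dim in range(len(result) - 1, -1, -1):
            factor = int(point_size_limit // np.prod(result))
            if factor <= 1:
                break
            result[i_dim] = min(shape[i_dim], result[i_dim] * factor)
    return result

# For example, a (1, 10, 20) source chunk in a (50, 100, 2000) array:
print(sketch_optimum_chunksize((1, 10, 20), (50, 100, 2000)))
# --> [20, 100, 2000]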
@@ -155,32 +141,41 @@ def as_lazy_data(data, chunks=None, asarray=False):
    Args:

    * data:
        An array. This will be converted to a dask array.
    * data (array-like):
        An indexable object with 'shape', 'dtype' and 'ndim' properties.
        This will be converted to a dask array.

    Kwargs:

    * chunks:
        Describes how the created dask array should be split up. Defaults to a
        value first defined in biggus (being `8 * 1024 * 1024 * 2`).
        For more information see
        http://dask.pydata.org/en/latest/array-creation.html#chunks.
    * chunks (list of int):
        If present, a source chunk shape, e.g. for a chunked netcdf variable.
    * asarray:
    * asarray (bool):
        If True, then chunks will be converted to instances of `ndarray`.
        Set to False (default) to pass chunks through unchanged.

    Returns:
        The input array converted to a dask array.

    .. note::
        The result chunk size is a multiple of 'chunks', if given, up to the
        dask default chunksize, i.e. `dask.config.get('array.chunk-size')`,
        or the full data shape if that is smaller.
        If 'chunks' is not given, the result has chunks of the full data shape,
        but reduced by a factor if that exceeds the dask default chunksize.

    """
    if chunks is None:
        # No existing chunks : Make a chunk the shape of the entire input array
        # (but we will subdivide it if too big).
        chunks = list(data.shape)

    # Expand or reduce the basic chunk shape to an optimum size.
    chunks = _optimum_chunksize(chunks, shape=data.shape, dtype=data.dtype)
    # Adjust chunk size for better dask performance,
    # NOTE: but only if no shape dimension is zero, so that we can handle the
    # PPDataProxy of "raw" landsea-masked fields, which have a shape of (0, 0).
    if all(elem > 0 for elem in data.shape):
        # Expand or reduce the basic chunk shape to an optimum size.
        chunks = _optimum_chunksize(chunks, shape=data.shape, dtype=data.dtype)

    if isinstance(data, ma.core.MaskedConstant):
        data = ma.masked_array(data.data, mask=data.mask)
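For orientation (not part of the diff), the revised `as_lazy_data` behaviour can be exercised like this -- a sketch, assuming an Iris installation including this change and dask's default 'array.chunk-size':

import numpy as np
from iris._lazy_data import as_lazy_data

# Small array, no source chunking given: the starting chunk is the full
# shape, which is already below the dask size limit, so it is kept.
small = np.zeros((100, 100), dtype='f4')
print(as_lazy_data(small).chunks)  # --> ((100,), (100,))

# A shape containing zero (e.g. a "raw" landsea-masked PP field proxy)
# now skips chunk optimisation entirely instead of mis-handling it.
empty = np.zeros((0, 0), dtype='f4')
print(as_lazy_data(empty).shape)  # --> (0, 0)
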
4 changes: 3 additions & 1 deletion lib/iris/fileformats/netcdf.py
@@ -510,8 +510,10 @@ def _get_cf_var_data(cf_var, filename):
                            netCDF4.default_fillvals[cf_var.dtype.str[1:]])
    proxy = NetCDFDataProxy(cf_var.shape, dtype, filename, cf_var.cf_name,
                            fill_value)
    # Get the chunking specified for the variable : this is either a shape, or
    # maybe the string "contiguous".
    chunks = cf_var.cf_data.chunking()
    # Chunks can be an iterable, or `'contiguous'`.
    # In the "contiguous" case, pass chunks=None to 'as_lazy_data'.
    if chunks == 'contiguous':
        chunks = None
    return as_lazy_data(proxy, chunks=chunks)
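
For context (a sketch, not part of the change): netCDF4's `Variable.chunking()` returns either a per-dimension chunk list or the string 'contiguous', which is why the translation to chunks=None is needed before calling 'as_lazy_data'. The file and variable names here are hypothetical:

import netCDF4

ds = netCDF4.Dataset('example.nc')     # hypothetical file
var = ds.variables['air_temperature']  # hypothetical variable name

chunks = var.chunking()  # e.g. [120, 30, 40], or the string 'contiguous'
if chunks == 'contiguous':
    # No on-disk chunking: let as_lazy_data start from the full shape.
    chunks = None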
9 changes: 9 additions & 0 deletions lib/iris/tests/unit/lazy_data/test_as_lazy_data.py
@@ -137,6 +137,15 @@ def test_large_specific_chunk_passthrough(self):
                                           dtype=np.dtype('f4'))])
        self.assertEqual(result.shape, huge_test_shape)

    def test_shapeless_data(self):
        # Check that chunk optimisation is skipped if shape contains a zero.
        limitcall_patch = self.patch('iris._lazy_data._optimum_chunksize')
        test_shape = (2, 1, 0, 2)
        data = self._dummydata(test_shape)
        result = as_lazy_data(data, chunks=test_shape)
        self.assertFalse(limitcall_patch.called)
        self.assertEqual(result.shape, test_shape)


if __name__ == '__main__':
    tests.main()
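
The new test relies on the iris test helper `self.patch`; a roughly equivalent standalone version, using `unittest.mock` (or the `mock` backport on Python 2) and a plain numpy array in place of `self._dummydata`, might look like this:

import numpy as np
from unittest import mock

from iris._lazy_data import as_lazy_data

def test_shapeless_data_standalone():
    test_shape = (2, 1, 0, 2)
    data = np.zeros(test_shape, dtype='f4')
    with mock.patch('iris._lazy_data._optimum_chunksize') as opt_patch:
        result = as_lazy_data(data, chunks=test_shape)
    # The zero-length dimension means chunk optimisation is skipped.
    assert not opt_patch.called
    assert result.shape == test_shape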
