Skip to content

Commit

Permalink
per-variable fill values (#4237)
Browse files Browse the repository at this point in the history
* implement the fill_value mapping

* get per-variable fill_values to work in DataArray.reindex

* Update xarray/core/dataarray.py

Co-authored-by: Stephan Hoyer <shoyer@google.com>

* check that the default value is used

* check that merge works with multiple fill values

* check that concat works with multiple fill values

* check that combine_nested works with multiple fill values

* check that Dataset.reindex and DataArray.reindex work

* check that aligning Datasets works

* check that Dataset.unstack works

* allow passing multiple fill values to full_like with datasets

* also allow overriding the dtype by variable

* document the dict fill values in Dataset.reindex

* document the changes to DataArray.reindex

* document the changes to unstack

* document the changes to align

* document the changes to concat and merge

* document the changes to Dataset.shift

* document the changes to combine_*

Co-authored-by: Stephan Hoyer <shoyer@google.com>
  • Loading branch information
keewis and shoyer authored Aug 24, 2020
1 parent 1a11d24 commit a36d0a1
Show file tree
Hide file tree
Showing 12 changed files with 284 additions and 81 deletions.
15 changes: 11 additions & 4 deletions xarray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ def align(
used in preference to the aligned indexes.
exclude : sequence of str, optional
Dimensions that must be excluded from alignment
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
Returns
-------
Expand Down Expand Up @@ -581,16 +583,21 @@ def reindex_variables(

for name, var in variables.items():
if name not in indexers:
if isinstance(fill_value, dict):
fill_value_ = fill_value.get(name, dtypes.NA)
else:
fill_value_ = fill_value

if sparse:
var = var._as_sparse(fill_value=fill_value)
var = var._as_sparse(fill_value=fill_value_)
key = tuple(
slice(None) if d in unchanged_dims else int_indexers.get(d, slice(None))
for d in var.dims
)
needs_masking = any(d in masked_dims for d in var.dims)

if needs_masking:
new_var = var._getitem_with_mask(key, fill_value=fill_value)
new_var = var._getitem_with_mask(key, fill_value=fill_value_)
elif all(is_full_slice(k) for k in key):
# no reindexing necessary
# here we need to manually deal with copying data, since
Expand Down
14 changes: 9 additions & 5 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,8 +393,10 @@ def combine_nested(
Details are in the documentation of concat
coords : {"minimal", "different", "all" or list of str}, optional
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
join : {"outer", "inner", "left", "right", "exact"}, optional
String indicating how to combine differing indexes
(excluding concat_dim) in objects
Expand Down Expand Up @@ -569,10 +571,12 @@ def combine_by_coords(
addition to the "minimal" data variables.
If objects are DataArrays, `data_vars` must be "all".
coords : {"minimal", "different", "all" or list of str}, optional
coords : {"minimal", "different", "all"} or list of str, optional
As per the "data_vars" kwarg, but for coordinate variables.
fill_value : scalar, optional
Value to use for newly missing values. If None, raises a ValueError if
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values. If None, raises a ValueError if
the passed Datasets do not create a complete hypercube.
join : {"outer", "inner", "left", "right", "exact"}, optional
String indicating how to combine differing indexes
Expand Down
58 changes: 51 additions & 7 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1364,10 +1364,13 @@ def full_like(other, fill_value, dtype: DTypeLike = None):
----------
other : DataArray, Dataset or Variable
The reference object in input
fill_value : scalar
Value to fill the new object with before returning it.
dtype : dtype, optional
dtype of the new array. If omitted, it defaults to other.dtype.
fill_value : scalar or dict-like
Value to fill the new object with before returning it. If
other is a Dataset, may also be a dict-like mapping data
variables to fill values.
dtype : dtype or dict-like of dtype, optional
    dtype of the new array. If a dict-like, maps variable names to
    dtypes. If omitted, it defaults to other.dtype.
Returns
-------
Expand Down Expand Up @@ -1427,6 +1430,34 @@ def full_like(other, fill_value, dtype: DTypeLike = None):
* lat (lat) int64 1 2
* lon (lon) int64 0 1 2
>>> ds = xr.Dataset(
... {"a": ("x", [3, 5, 2]), "b": ("x", [9, 1, 0])}, coords={"x": [2, 4, 6]}
... )
>>> ds
<xarray.Dataset>
Dimensions: (x: 3)
Coordinates:
* x (x) int64 2 4 6
Data variables:
a (x) int64 3 5 2
b (x) int64 9 1 0
>>> xr.full_like(ds, fill_value={"a": 1, "b": 2})
<xarray.Dataset>
Dimensions: (x: 3)
Coordinates:
* x (x) int64 2 4 6
Data variables:
a (x) int64 1 1 1
b (x) int64 2 2 2
>>> xr.full_like(ds, fill_value={"a": 1, "b": 2}, dtype={"a": bool, "b": float})
<xarray.Dataset>
Dimensions: (x: 3)
Coordinates:
* x (x) int64 2 4 6
Data variables:
a (x) bool True True True
b (x) float64 2.0 2.0 2.0
See also
--------
Expand All @@ -1438,12 +1469,22 @@ def full_like(other, fill_value, dtype: DTypeLike = None):
from .dataset import Dataset
from .variable import Variable

if not is_scalar(fill_value):
raise ValueError(f"fill_value must be scalar. Received {fill_value} instead.")
if not is_scalar(fill_value) and not (
isinstance(other, Dataset) and isinstance(fill_value, dict)
):
raise ValueError(
f"fill_value must be scalar or, for datasets, a dict-like. Received {fill_value} instead."
)

if isinstance(other, Dataset):
if not isinstance(fill_value, dict):
fill_value = {k: fill_value for k in other.data_vars.keys()}

if not isinstance(dtype, dict):
dtype = {k: dtype for k in other.data_vars.keys()}

data_vars = {
k: _full_like_variable(v, fill_value, dtype)
k: _full_like_variable(v, fill_value.get(k, dtypes.NA), dtype.get(k, None))
for k, v in other.data_vars.items()
}
return Dataset(data_vars, coords=other.coords, attrs=other.attrs)
Expand All @@ -1466,6 +1507,9 @@ def _full_like_variable(other, fill_value, dtype: DTypeLike = None):
"""
from .variable import Variable

if fill_value is dtypes.NA:
fill_value = dtypes.get_fill_value(dtype if dtype is not None else other.dtype)

if isinstance(other.data, dask_array_type):
import dask.array

Expand Down
6 changes: 4 additions & 2 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,10 @@ def concat(
List of integer arrays which specifies the integer positions to which
to assign each dataset along the concatenated dimension. If not
supplied, objects are concatenated in the provided order.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
join : {"outer", "inner", "left", "right", "exact"}, optional
String indicating how to combine differing indexes
(excluding dim) in objects
Expand Down
26 changes: 20 additions & 6 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,8 +1308,10 @@ def reindex_like(
``copy=False`` and reindexing is unnecessary, or can be performed
with only slice operations, then the output may share memory with
the input. In either case, a new xarray object is always returned.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values. Use this
data array's name to refer to the data array's values.
Returns
-------
Expand Down Expand Up @@ -1368,8 +1370,10 @@ def reindex(
Maximum distance between original and new labels for inexact
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values. Use this
data array's name to refer to the data array's values.
**indexers_kwargs : {dim: indexer, ...}, optional
The keyword arguments form of ``indexers``.
One of indexers or indexers_kwargs must be provided.
Expand All @@ -1386,6 +1390,13 @@ def reindex(
align
"""
indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "reindex")
if isinstance(fill_value, dict):
fill_value = fill_value.copy()
sentinel = object()
value = fill_value.pop(self.name, sentinel)
if value is not sentinel:
fill_value[_THIS_ARRAY] = value

ds = self._to_temp_dataset().reindex(
indexers=indexers,
method=method,
Expand Down Expand Up @@ -1867,8 +1878,11 @@ def unstack(
dim : hashable or sequence of hashable, optional
Dimension(s) over which to unstack. By default unstacks all
MultiIndexes.
fill_value : scalar, default: nan
value to be filled.
fill_value : scalar or dict-like, default: nan
    value to be filled. If a dict-like, maps variable names to
    fill values. Use the data array's name to refer to its
    values. If not provided or if the dict-like does not contain
    all variables, the dtype's NA value will be used.
sparse : bool, default: False
use sparse-array if True
Expand Down
47 changes: 36 additions & 11 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2313,8 +2313,9 @@ def reindex_like(
``copy=False`` and reindexing is unnecessary, or can be performed
with only slice operations, then the output may share memory with
the input. In either case, a new xarray object is always returned.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
    Value to use for newly missing values. If a dict-like, maps
    variable names to fill values.
Returns
-------
Expand Down Expand Up @@ -2373,8 +2374,9 @@ def reindex(
``copy=False`` and reindexing is unnecessary, or can be performed
with only slice operations, then the output may share memory with
the input. In either case, a new xarray object is always returned.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like,
maps variable names (including coordinates) to fill values.
sparse : bool, default: False
use sparse-array.
**indexers_kwargs : {dim: indexer, ...}, optional
Expand Down Expand Up @@ -2441,6 +2443,19 @@ def reindex(
temperature (station) float64 18.84 0.0 19.22 0.0
pressure (station) float64 324.1 0.0 122.8 0.0
We can also use different fill values for each variable.
>>> x.reindex(
... {"station": new_index}, fill_value={"temperature": 0, "pressure": 100}
... )
<xarray.Dataset>
Dimensions: (station: 4)
Coordinates:
* station (station) object 'boston' 'austin' 'seattle' 'lincoln'
Data variables:
temperature (station) float64 18.84 0.0 19.22 0.0
pressure (station) float64 324.1 100.0 122.8 100.0
Because the index is not monotonically increasing or decreasing, we cannot use arguments
to the keyword method to fill the `NaN` values.
Expand Down Expand Up @@ -3544,8 +3559,10 @@ def unstack(
dim : hashable or iterable of hashable, optional
Dimension(s) over which to unstack. By default unstacks all
MultiIndexes.
fill_value : scalar, default: nan
value to be filled
fill_value : scalar or dict-like, default: nan
value to be filled. If a dict-like, maps variable names to
fill values. If not provided or if the dict-like does not
contain all variables, the dtype's NA value will be used.
sparse : bool, default: False
use sparse-array if True
Expand Down Expand Up @@ -3663,8 +3680,9 @@ def merge(
- 'left': use indexes from ``self``
- 'right': use indexes from ``other``
- 'exact': error instead of aligning non-equal indexes
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values.
Returns
-------
Expand Down Expand Up @@ -5117,8 +5135,9 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs):
Integer offset to shift along each of the given dimensions.
Positive offsets shift to the right; negative offsets shift to the
left.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values.
**shifts_kwargs
The keyword arguments form of ``shifts``.
One of shifts or shifts_kwargs must be provided.
Expand Down Expand Up @@ -5153,8 +5172,14 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs):
variables = {}
for name, var in self.variables.items():
if name in self.data_vars:
fill_value_ = (
fill_value.get(name, dtypes.NA)
if isinstance(fill_value, dict)
else fill_value
)

var_shifts = {k: v for k, v in shifts.items() if k in var.dims}
variables[name] = var.shift(fill_value=fill_value, shifts=var_shifts)
variables[name] = var.shift(fill_value=fill_value_, shifts=var_shifts)
else:
variables[name] = var

Expand Down
6 changes: 4 additions & 2 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,8 +666,10 @@ def merge(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
String indicating how to combine attrs of the objects being merged:
Expand Down
18 changes: 13 additions & 5 deletions xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,18 +601,26 @@ def test_combine_concat_over_redundant_nesting(self):
expected = Dataset({"x": [0]})
assert_identical(expected, actual)

@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0])
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"a": 2, "b": 1}])
def test_combine_nested_fill_value(self, fill_value):
datasets = [
Dataset({"a": ("x", [2, 3]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "x": [0, 1]}),
Dataset({"a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1]}),
]
if fill_value == dtypes.NA:
# if we supply the default, we expect the missing value for a
# float array
fill_value = np.nan
fill_value_a = fill_value_b = np.nan
elif isinstance(fill_value, dict):
fill_value_a = fill_value["a"]
fill_value_b = fill_value["b"]
else:
fill_value_a = fill_value_b = fill_value
expected = Dataset(
{"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])},
{
"a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]),
"b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]),
},
{"x": [0, 1, 2]},
)
actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
Expand Down
18 changes: 13 additions & 5 deletions xarray/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,18 +349,26 @@ def test_concat_multiindex(self):
assert expected.equals(actual)
assert isinstance(actual.x.to_index(), pd.MultiIndex)

@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0])
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"a": 2, "b": 1}])
def test_concat_fill_value(self, fill_value):
datasets = [
Dataset({"a": ("x", [2, 3]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "x": [0, 1]}),
Dataset({"a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1]}),
]
if fill_value == dtypes.NA:
# if we supply the default, we expect the missing value for a
# float array
fill_value = np.nan
fill_value_a = fill_value_b = np.nan
elif isinstance(fill_value, dict):
fill_value_a = fill_value["a"]
fill_value_b = fill_value["b"]
else:
fill_value_a = fill_value_b = fill_value
expected = Dataset(
{"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])},
{
"a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]),
"b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]),
},
{"x": [0, 1, 2]},
)
actual = concat(datasets, dim="t", fill_value=fill_value)
Expand Down
Loading

0 comments on commit a36d0a1

Please sign in to comment.