Skip to content

Commit

Permalink
per-variable fill values (#4237)
Browse files Browse the repository at this point in the history
* implement the fill_value mapping

* get per-variable fill_values to work in DataArray.reindex

* Update xarray/core/dataarray.py

Co-authored-by: Stephan Hoyer <shoyer@google.com>

* check that the default value is used

* check that merge works with multiple fill values

* check that concat works with multiple fill values

* check that combine_nested works with multiple fill values

* check that Dataset.reindex and DataArray.reindex work

* check that aligning Datasets works

* check that Dataset.unstack works

* allow passing multiple fill values to full_like with datasets

* also allow overriding the dtype by variable

* document the dict fill values in Dataset.reindex

* document the changes to DataArray.reindex

* document the changes to unstack

* document the changes to align

* document the changes to concat and merge

* document the changes to Dataset.shift

* document the changes to combine_*

Co-authored-by: Stephan Hoyer <shoyer@google.com>
  • Loading branch information
keewis and shoyer authored Aug 24, 2020
1 parent 1a11d24 commit a36d0a1
Show file tree
Hide file tree
Showing 12 changed files with 284 additions and 81 deletions.
15 changes: 11 additions & 4 deletions xarray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ def align(
used in preference to the aligned indexes.
exclude : sequence of str, optional
Dimensions that must be excluded from alignment
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
Returns
-------
Expand Down Expand Up @@ -581,16 +583,21 @@ def reindex_variables(

for name, var in variables.items():
if name not in indexers:
if isinstance(fill_value, dict):
fill_value_ = fill_value.get(name, dtypes.NA)
else:
fill_value_ = fill_value

if sparse:
var = var._as_sparse(fill_value=fill_value)
var = var._as_sparse(fill_value=fill_value_)
key = tuple(
slice(None) if d in unchanged_dims else int_indexers.get(d, slice(None))
for d in var.dims
)
needs_masking = any(d in masked_dims for d in var.dims)

if needs_masking:
new_var = var._getitem_with_mask(key, fill_value=fill_value)
new_var = var._getitem_with_mask(key, fill_value=fill_value_)
elif all(is_full_slice(k) for k in key):
# no reindexing necessary
# here we need to manually deal with copying data, since
Expand Down
14 changes: 9 additions & 5 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,8 +393,10 @@ def combine_nested(
Details are in the documentation of concat
coords : {"minimal", "different", "all" or list of str}, optional
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
join : {"outer", "inner", "left", "right", "exact"}, optional
String indicating how to combine differing indexes
(excluding concat_dim) in objects
Expand Down Expand Up @@ -569,10 +571,12 @@ def combine_by_coords(
addition to the "minimal" data variables.
If objects are DataArrays, `data_vars` must be "all".
coords : {"minimal", "different", "all" or list of str}, optional
coords : {"minimal", "different", "all"} or list of str, optional
As per the "data_vars" kwarg, but for coordinate variables.
fill_value : scalar, optional
Value to use for newly missing values. If None, raises a ValueError if
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values. If None, raises a ValueError if
the passed Datasets do not create a complete hypercube.
join : {"outer", "inner", "left", "right", "exact"}, optional
String indicating how to combine differing indexes
Expand Down
58 changes: 51 additions & 7 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1364,10 +1364,13 @@ def full_like(other, fill_value, dtype: DTypeLike = None):
----------
other : DataArray, Dataset or Variable
The reference object in input
fill_value : scalar
Value to fill the new object with before returning it.
dtype : dtype, optional
dtype of the new array. If omitted, it defaults to other.dtype.
fill_value : scalar or dict-like
Value to fill the new object with before returning it. If
other is a Dataset, may also be a dict-like mapping data
variables to fill values.
dtype : dtype or dict-like of dtype, optional
    dtype of the new array. If a dict-like, maps variable names to
    dtypes. If omitted, it defaults to other.dtype.
Returns
-------
Expand Down Expand Up @@ -1427,6 +1430,34 @@ def full_like(other, fill_value, dtype: DTypeLike = None):
* lat (lat) int64 1 2
* lon (lon) int64 0 1 2
>>> ds = xr.Dataset(
... {"a": ("x", [3, 5, 2]), "b": ("x", [9, 1, 0])}, coords={"x": [2, 4, 6]}
... )
>>> ds
<xarray.Dataset>
Dimensions: (x: 3)
Coordinates:
* x (x) int64 2 4 6
Data variables:
a (x) int64 3 5 2
b (x) int64 9 1 0
>>> xr.full_like(ds, fill_value={"a": 1, "b": 2})
<xarray.Dataset>
Dimensions: (x: 3)
Coordinates:
* x (x) int64 2 4 6
Data variables:
a (x) int64 1 1 1
b (x) int64 2 2 2
>>> xr.full_like(ds, fill_value={"a": 1, "b": 2}, dtype={"a": bool, "b": float})
<xarray.Dataset>
Dimensions: (x: 3)
Coordinates:
* x (x) int64 2 4 6
Data variables:
a (x) bool True True True
b (x) float64 2.0 2.0 2.0
See also
--------
Expand All @@ -1438,12 +1469,22 @@ def full_like(other, fill_value, dtype: DTypeLike = None):
from .dataset import Dataset
from .variable import Variable

if not is_scalar(fill_value):
raise ValueError(f"fill_value must be scalar. Received {fill_value} instead.")
if not is_scalar(fill_value) and not (
isinstance(other, Dataset) and isinstance(fill_value, dict)
):
raise ValueError(
f"fill_value must be scalar or, for datasets, a dict-like. Received {fill_value} instead."
)

if isinstance(other, Dataset):
if not isinstance(fill_value, dict):
fill_value = {k: fill_value for k in other.data_vars.keys()}

if not isinstance(dtype, dict):
dtype = {k: dtype for k in other.data_vars.keys()}

data_vars = {
k: _full_like_variable(v, fill_value, dtype)
k: _full_like_variable(v, fill_value.get(k, dtypes.NA), dtype.get(k, None))
for k, v in other.data_vars.items()
}
return Dataset(data_vars, coords=other.coords, attrs=other.attrs)
Expand All @@ -1466,6 +1507,9 @@ def _full_like_variable(other, fill_value, dtype: DTypeLike = None):
"""
from .variable import Variable

if fill_value is dtypes.NA:
fill_value = dtypes.get_fill_value(dtype if dtype is not None else other.dtype)

if isinstance(other.data, dask_array_type):
import dask.array

Expand Down
6 changes: 4 additions & 2 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,10 @@ def concat(
List of integer arrays which specifies the integer positions to which
to assign each dataset along the concatenated dimension. If not
supplied, objects are concatenated in the provided order.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
join : {"outer", "inner", "left", "right", "exact"}, optional
String indicating how to combine differing indexes
(excluding dim) in objects
Expand Down
26 changes: 20 additions & 6 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,8 +1308,10 @@ def reindex_like(
``copy=False`` and reindexing is unnecessary, or can be performed
with only slice operations, then the output may share memory with
the input. In either case, a new xarray object is always returned.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values. Use this
data array's name to refer to the data array's values.
Returns
-------
Expand Down Expand Up @@ -1368,8 +1370,10 @@ def reindex(
Maximum distance between original and new labels for inexact
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values. Use this
data array's name to refer to the data array's values.
**indexers_kwargs : {dim: indexer, ...}, optional
The keyword arguments form of ``indexers``.
One of indexers or indexers_kwargs must be provided.
Expand All @@ -1386,6 +1390,13 @@ def reindex(
align
"""
indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "reindex")
if isinstance(fill_value, dict):
fill_value = fill_value.copy()
sentinel = object()
value = fill_value.pop(self.name, sentinel)
if value is not sentinel:
fill_value[_THIS_ARRAY] = value

ds = self._to_temp_dataset().reindex(
indexers=indexers,
method=method,
Expand Down Expand Up @@ -1867,8 +1878,11 @@ def unstack(
dim : hashable or sequence of hashable, optional
Dimension(s) over which to unstack. By default unstacks all
MultiIndexes.
fill_value : scalar, default: nan
value to be filled.
fill_value : scalar or dict-like, default: nan
    value to be filled. If a dict-like, maps variable names to
    fill values. Use the data array's name to refer to its
    values. If not provided or if the dict-like does not contain
    all variables, the dtype's NA value will be used.
sparse : bool, default: False
use sparse-array if True
Expand Down
47 changes: 36 additions & 11 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2313,8 +2313,9 @@ def reindex_like(
``copy=False`` and reindexing is unnecessary, or can be performed
with only slice operations, then the output may share memory with
the input. In either case, a new xarray object is always returned.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
    Value to use for newly missing values. If a dict-like, maps
    variable names to fill values.
Returns
-------
Expand Down Expand Up @@ -2373,8 +2374,9 @@ def reindex(
``copy=False`` and reindexing is unnecessary, or can be performed
with only slice operations, then the output may share memory with
the input. In either case, a new xarray object is always returned.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like,
maps variable names (including coordinates) to fill values.
sparse : bool, default: False
use sparse-array.
**indexers_kwargs : {dim: indexer, ...}, optional
Expand Down Expand Up @@ -2441,6 +2443,19 @@ def reindex(
temperature (station) float64 18.84 0.0 19.22 0.0
pressure (station) float64 324.1 0.0 122.8 0.0
We can also use different fill values for each variable.
>>> x.reindex(
... {"station": new_index}, fill_value={"temperature": 0, "pressure": 100}
... )
<xarray.Dataset>
Dimensions: (station: 4)
Coordinates:
* station (station) object 'boston' 'austin' 'seattle' 'lincoln'
Data variables:
temperature (station) float64 18.84 0.0 19.22 0.0
pressure (station) float64 324.1 100.0 122.8 100.0
Because the index is not monotonically increasing or decreasing, we cannot use arguments
to the keyword method to fill the `NaN` values.
Expand Down Expand Up @@ -3544,8 +3559,10 @@ def unstack(
dim : hashable or iterable of hashable, optional
Dimension(s) over which to unstack. By default unstacks all
MultiIndexes.
fill_value : scalar, default: nan
value to be filled
fill_value : scalar or dict-like, default: nan
value to be filled. If a dict-like, maps variable names to
fill values. If not provided or if the dict-like does not
contain all variables, the dtype's NA value will be used.
sparse : bool, default: False
use sparse-array if True
Expand Down Expand Up @@ -3663,8 +3680,9 @@ def merge(
- 'left': use indexes from ``self``
- 'right': use indexes from ``other``
- 'exact': error instead of aligning non-equal indexes
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values.
Returns
-------
Expand Down Expand Up @@ -5117,8 +5135,9 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs):
Integer offset to shift along each of the given dimensions.
Positive offsets shift to the right; negative offsets shift to the
left.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names (including coordinates) to fill values.
**shifts_kwargs
The keyword arguments form of ``shifts``.
One of shifts or shifts_kwargs must be provided.
Expand Down Expand Up @@ -5153,8 +5172,14 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs):
variables = {}
for name, var in self.variables.items():
if name in self.data_vars:
fill_value_ = (
fill_value.get(name, dtypes.NA)
if isinstance(fill_value, dict)
else fill_value
)

var_shifts = {k: v for k, v in shifts.items() if k in var.dims}
variables[name] = var.shift(fill_value=fill_value, shifts=var_shifts)
variables[name] = var.shift(fill_value=fill_value_, shifts=var_shifts)
else:
variables[name] = var

Expand Down
6 changes: 4 additions & 2 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,8 +666,10 @@ def merge(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
fill_value : scalar, optional
Value to use for newly missing values
fill_value : scalar or dict-like, optional
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
String indicating how to combine attrs of the objects being merged:
Expand Down
18 changes: 13 additions & 5 deletions xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,18 +601,26 @@ def test_combine_concat_over_redundant_nesting(self):
expected = Dataset({"x": [0]})
assert_identical(expected, actual)

@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0])
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"a": 2, "b": 1}])
def test_combine_nested_fill_value(self, fill_value):
datasets = [
Dataset({"a": ("x", [2, 3]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "x": [0, 1]}),
Dataset({"a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1]}),
]
if fill_value == dtypes.NA:
# if we supply the default, we expect the missing value for a
# float array
fill_value = np.nan
fill_value_a = fill_value_b = np.nan
elif isinstance(fill_value, dict):
fill_value_a = fill_value["a"]
fill_value_b = fill_value["b"]
else:
fill_value_a = fill_value_b = fill_value
expected = Dataset(
{"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])},
{
"a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]),
"b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]),
},
{"x": [0, 1, 2]},
)
actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value)
Expand Down
18 changes: 13 additions & 5 deletions xarray/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,18 +349,26 @@ def test_concat_multiindex(self):
assert expected.equals(actual)
assert isinstance(actual.x.to_index(), pd.MultiIndex)

@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0])
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0, {"a": 2, "b": 1}])
def test_concat_fill_value(self, fill_value):
datasets = [
Dataset({"a": ("x", [2, 3]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "x": [0, 1]}),
Dataset({"a": ("x", [2, 3]), "b": ("x", [-2, 1]), "x": [1, 2]}),
Dataset({"a": ("x", [1, 2]), "b": ("x", [3, -1]), "x": [0, 1]}),
]
if fill_value == dtypes.NA:
# if we supply the default, we expect the missing value for a
# float array
fill_value = np.nan
fill_value_a = fill_value_b = np.nan
elif isinstance(fill_value, dict):
fill_value_a = fill_value["a"]
fill_value_b = fill_value["b"]
else:
fill_value_a = fill_value_b = fill_value
expected = Dataset(
{"a": (("t", "x"), [[fill_value, 2, 3], [1, 2, fill_value]])},
{
"a": (("t", "x"), [[fill_value_a, 2, 3], [1, 2, fill_value_a]]),
"b": (("t", "x"), [[fill_value_b, -2, 1], [3, -1, fill_value_b]]),
},
{"x": [0, 1, 2]},
)
actual = concat(datasets, dim="t", fill_value=fill_value)
Expand Down
Loading

0 comments on commit a36d0a1

Please sign in to comment.