Skip to content

Commit

Permalink
Allow cov & corr to handle missing values (#4351)
Browse files Browse the repository at this point in the history
* Allow cov & corr to handle missing values

* Remove artifacts

* Fix floating assert

* Update xarray/tests/test_computation.py

Co-authored-by: keewis <keewis@users.noreply.github.com>

* Add test for multiple explicit dims

* Use np.ma rather than drop=True

* Add whatsnew

* reformat

Co-authored-by: keewis <keewis@users.noreply.github.com>
  • Loading branch information
max-sixty and keewis authored Aug 24, 2020
1 parent d3536b9 commit 1a11d24
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 13 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ New Features
- ``min_count`` can be supplied to reductions such as ``.sum`` when specifying
  multiple dimensions to reduce over. (:pull:`4356`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
- :py:func:`xarray.cov` and :py:func:`xarray.corr` now handle missing values. (:pull:`4351`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
- Build ``CFTimeIndex.__repr__`` explicitly as :py:class:`pandas.Index`. Add ``calendar`` as a new
property for :py:class:`CFTimeIndex` and show ``calendar`` and ``length`` in
``CFTimeIndex.__repr__`` (:issue:`2416`, :pull:`4092`)
Expand Down
4 changes: 3 additions & 1 deletion xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,7 +1277,9 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None):
# N.B. `skipna=False` is required or there is a bug when computing
# auto-covariance. E.g. Try xr.cov(da,da) for
# da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count)
cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=True, min_count=1) / (
valid_count
)

if method == "cov":
return cov
Expand Down
36 changes: 24 additions & 12 deletions xarray/tests/test_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,8 +940,9 @@ def test_vectorize_exclude_dims_dask():
assert_identical(expected, actual)


with raises_regex(TypeError, "Only xr.DataArray is supported"):
xr.corr(xr.Dataset(), xr.Dataset())
def test_corr_only_dataarray():
    """xr.corr rejects non-DataArray inputs (e.g. Dataset) with a TypeError."""
    ds_left, ds_right = xr.Dataset(), xr.Dataset()
    with pytest.raises(TypeError, match="Only xr.DataArray is supported"):
        xr.corr(ds_left, ds_right)


def arrays_w_tuples():
Expand Down Expand Up @@ -984,12 +985,14 @@ def np_cov_ind(ts1, ts2, a, x):
ts1, ts2 = broadcast(ts1, ts2)
valid_values = ts1.notnull() & ts2.notnull()

# While dropping isn't ideal here, numpy will return nan
# if any segment contains a NaN.
ts1 = ts1.where(valid_values)
ts2 = ts2.where(valid_values)

return np.cov(
ts1.sel(a=a, x=x).data.flatten(),
ts2.sel(a=a, x=x).data.flatten(),
return np.ma.cov(
np.ma.masked_invalid(ts1.sel(a=a, x=x).data.flatten()),
np.ma.masked_invalid(ts2.sel(a=a, x=x).data.flatten()),
ddof=ddof,
)[0, 1]

Expand All @@ -1010,7 +1013,11 @@ def np_cov(ts1, ts2):
ts1 = ts1.where(valid_values)
ts2 = ts2.where(valid_values)

return np.cov(ts1.data.flatten(), ts2.data.flatten(), ddof=ddof)[0, 1]
return np.ma.cov(
np.ma.masked_invalid(ts1.data.flatten()),
np.ma.masked_invalid(ts2.data.flatten()),
ddof=ddof,
)[0, 1]

expected = np_cov(da_a, da_b)
actual = xr.cov(da_a, da_b, dim=dim, ddof=ddof)
Expand All @@ -1033,8 +1040,9 @@ def np_corr_ind(ts1, ts2, a, x):
ts1 = ts1.where(valid_values)
ts2 = ts2.where(valid_values)

return np.corrcoef(
ts1.sel(a=a, x=x).data.flatten(), ts2.sel(a=a, x=x).data.flatten()
return np.ma.corrcoef(
np.ma.masked_invalid(ts1.sel(a=a, x=x).data.flatten()),
np.ma.masked_invalid(ts2.sel(a=a, x=x).data.flatten()),
)[0, 1]

expected = np.zeros((3, 4))
Expand All @@ -1054,7 +1062,10 @@ def np_corr(ts1, ts2):
ts1 = ts1.where(valid_values)
ts2 = ts2.where(valid_values)

return np.corrcoef(ts1.data.flatten(), ts2.data.flatten())[0, 1]
return np.ma.corrcoef(
np.ma.masked_invalid(ts1.data.flatten()),
np.ma.masked_invalid(ts2.data.flatten()),
)[0, 1]

expected = np_corr(da_a, da_b)
actual = xr.corr(da_a, da_b, dim)
Expand Down Expand Up @@ -1084,13 +1095,14 @@ def test_covcorr_consistency(da_a, da_b, dim):
@pytest.mark.parametrize(
"da_a", arrays_w_tuples()[0],
)
@pytest.mark.parametrize("dim", [None, "time", "x"])
@pytest.mark.parametrize("dim", [None, "time", "x", ["time", "x"]])
def test_autocov(da_a, dim):
# Testing that the autocovariance*(N-1) is ~=~ to the variance matrix
# 1. Ignore the nans
valid_values = da_a.notnull()
da_a = da_a.where(valid_values)
expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=False)
# Because we're using ddof=1, this requires > 1 value in each sample
da_a = da_a.where(valid_values.sum(dim=dim) > 1)
expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=True, min_count=1)
actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1)
assert_allclose(actual, expected)

Expand Down

0 comments on commit 1a11d24

Please sign in to comment.