From 1a11d249a8338dad7c533f2ea7c365a823022d15 Mon Sep 17 00:00:00 2001
From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>
Date: Mon, 24 Aug 2020 12:39:10 -0400
Subject: [PATCH] Allow cov & corr to handle missing values (#4351)

* Allow cov & corr to handle missing values

* Remove artifacts

* Fix floating assert

* Update xarray/tests/test_computation.py

Co-authored-by: keewis

* Add test for multiple explicit dims

* Use np.ma rather than drop=True

* Add whatsnew

* reformat

Co-authored-by: keewis
---
 doc/whats-new.rst                |  2 ++
 xarray/core/computation.py       |  4 +++-
 xarray/tests/test_computation.py | 36 ++++++++++++++++++++++++++++------------
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index db05c38e1c9..ce73b01bda6 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -35,6 +35,8 @@ New Features
 - ``min_count`` can be supplied to reductions such as ``.sum`` when specifying
   multiple dimension to reduce over. (:pull:`4356`)
   By `Maximilian Roos `_.
+- :py:func:`xarray.cov` and :py:func:`xarray.corr` now handle missing values. (:pull:`4351`)
+  By `Maximilian Roos `_.
 - Build ``CFTimeIndex.__repr__`` explicitly as :py:class:`pandas.Index`. Add ``calendar`` as a new
   property for :py:class:`CFTimeIndex` and show ``calendar`` and ``length`` in
   ``CFTimeIndex.__repr__`` (:issue:`2416`, :pull:`4092`)
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
index 8231c2470d6..e9110cbfead 100644
--- a/xarray/core/computation.py
+++ b/xarray/core/computation.py
@@ -1277,7 +1277,9 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None):
     # N.B. `skipna=False` is required or there is a bug when computing
     # auto-covariance. E.g. Try xr.cov(da,da) for
     # da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
-    cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count)
+    cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=True, min_count=1) / (
+        valid_count
+    )
 
     if method == "cov":
         return cov
diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py
index 32505f24ac4..5df783e4878 100644
--- a/xarray/tests/test_computation.py
+++ b/xarray/tests/test_computation.py
@@ -940,8 +940,9 @@ def test_vectorize_exclude_dims_dask():
     assert_identical(expected, actual)
 
 
-with raises_regex(TypeError, "Only xr.DataArray is supported"):
-    xr.corr(xr.Dataset(), xr.Dataset())
+def test_corr_only_dataarray():
+    with pytest.raises(TypeError, match="Only xr.DataArray is supported"):
+        xr.corr(xr.Dataset(), xr.Dataset())
 
 
 def arrays_w_tuples():
@@ -984,12 +985,14 @@ def np_cov_ind(ts1, ts2, a, x):
             ts1, ts2 = broadcast(ts1, ts2)
             valid_values = ts1.notnull() & ts2.notnull()
 
+            # While dropping isn't ideal here, numpy will return nan
+            # if any segment contains a NaN.
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.cov(
-                ts1.sel(a=a, x=x).data.flatten(),
-                ts2.sel(a=a, x=x).data.flatten(),
+            return np.ma.cov(
+                np.ma.masked_invalid(ts1.sel(a=a, x=x).data.flatten()),
+                np.ma.masked_invalid(ts2.sel(a=a, x=x).data.flatten()),
                 ddof=ddof,
             )[0, 1]
 
@@ -1010,7 +1013,11 @@ def np_cov(ts1, ts2):
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.cov(ts1.data.flatten(), ts2.data.flatten(), ddof=ddof)[0, 1]
+            return np.ma.cov(
+                np.ma.masked_invalid(ts1.data.flatten()),
+                np.ma.masked_invalid(ts2.data.flatten()),
+                ddof=ddof,
+            )[0, 1]
 
         expected = np_cov(da_a, da_b)
         actual = xr.cov(da_a, da_b, dim=dim, ddof=ddof)
@@ -1033,8 +1040,9 @@ def np_corr_ind(ts1, ts2, a, x):
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.corrcoef(
-                ts1.sel(a=a, x=x).data.flatten(), ts2.sel(a=a, x=x).data.flatten()
+            return np.ma.corrcoef(
+                np.ma.masked_invalid(ts1.sel(a=a, x=x).data.flatten()),
+                np.ma.masked_invalid(ts2.sel(a=a, x=x).data.flatten()),
             )[0, 1]
 
         expected = np.zeros((3, 4))
@@ -1054,7 +1062,10 @@ def np_corr(ts1, ts2):
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.corrcoef(ts1.data.flatten(), ts2.data.flatten())[0, 1]
+            return np.ma.corrcoef(
+                np.ma.masked_invalid(ts1.data.flatten()),
+                np.ma.masked_invalid(ts2.data.flatten()),
+            )[0, 1]
 
         expected = np_corr(da_a, da_b)
         actual = xr.corr(da_a, da_b, dim)
@@ -1084,13 +1095,14 @@ def test_covcorr_consistency(da_a, da_b, dim):
 @pytest.mark.parametrize(
     "da_a",
     arrays_w_tuples()[0],
 )
-@pytest.mark.parametrize("dim", [None, "time", "x"])
+@pytest.mark.parametrize("dim", [None, "time", "x", ["time", "x"]])
 def test_autocov(da_a, dim):
     # Testing that the autocovariance*(N-1) is ~=~ to the variance matrix
     # 1. Ignore the nans
     valid_values = da_a.notnull()
-    da_a = da_a.where(valid_values)
-    expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=False)
+    # Because we're using ddof=1, this requires > 1 value in each sample
+    da_a = da_a.where(valid_values.sum(dim=dim) > 1)
+    expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=True, min_count=1)
     actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1)
     assert_allclose(actual, expected)
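
Illustrative note (not part of the patch): a minimal sketch of the behaviour this change enables, assuming a build of xarray that includes it. With the NaN-aware sum (``skipna=True, min_count=1``) in ``_cov_corr``, observation pairs containing a missing value are skipped instead of propagating NaN into the result, mirroring the ``np.ma`` reference used in the updated tests. The example arrays below are made up for illustration.

import numpy as np
import xarray as xr

# Two aligned series; one observation is missing in da_b.
da_a = xr.DataArray([1.0, 2.0, 3.0, 4.0], dims="time")
da_b = xr.DataArray([1.0, np.nan, 2.5, 5.0], dims="time")

# With the fix, the NaN pair is ignored and both results are finite.
print(xr.cov(da_a, da_b, dim="time", ddof=1).item())
print(xr.corr(da_a, da_b, dim="time").item())

# Reference computation with numpy masked arrays, as in the updated tests:
# masked entries are excluded pairwise before computing the statistics.
ts1 = np.ma.masked_invalid(da_a.values)
ts2 = np.ma.masked_invalid(da_b.values)
print(np.ma.cov(ts1, ts2, ddof=1)[0, 1])
print(np.ma.corrcoef(ts1, ts2)[0, 1])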