From 1a11d249a8338dad7c533f2ea7c365a823022d15 Mon Sep 17 00:00:00 2001
From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>
Date: Mon, 24 Aug 2020 12:39:10 -0400
Subject: [PATCH] Allow cov & corr to handle missing values (#4351)

* Allow cov & corr to handle missing values

* Remove artifacts

* Fix floating assert

* Update xarray/tests/test_computation.py

Co-authored-by: keewis

* Add test for multiple explicit dims

* Use np.ma rather than drop=True

* Add whatsnew

* reformat

Co-authored-by: keewis
---
 doc/whats-new.rst                |  2 ++
 xarray/core/computation.py       |  4 +++-
 xarray/tests/test_computation.py | 36 ++++++++++++++++++++++++++++------------
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index db05c38e1c9..ce73b01bda6 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -35,6 +35,8 @@ New Features
 - ``min_count`` can be supplied to reductions such as ``.sum`` when specifying
   multiple dimension to reduce over. (:pull:`4356`)
   By `Maximilian Roos `_.
+- :py:func:`xarray.cov` and :py:func:`xarray.corr` now handle missing values. (:pull:`4351`)
+  By `Maximilian Roos `_.
 - Build ``CFTimeIndex.__repr__`` explicitly as :py:class:`pandas.Index`. Add ``calendar`` as a new
   property for :py:class:`CFTimeIndex` and show ``calendar`` and ``length`` in
   ``CFTimeIndex.__repr__`` (:issue:`2416`, :pull:`4092`)
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
index 8231c2470d6..e9110cbfead 100644
--- a/xarray/core/computation.py
+++ b/xarray/core/computation.py
@@ -1277,7 +1277,9 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None):
     # N.B. `skipna=False` is required or there is a bug when computing
     # auto-covariance. E.g. Try xr.cov(da,da) for
     # da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
-    cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count)
+    cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=True, min_count=1) / (
+        valid_count
+    )
 
     if method == "cov":
         return cov
diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py
index 32505f24ac4..5df783e4878 100644
--- a/xarray/tests/test_computation.py
+++ b/xarray/tests/test_computation.py
@@ -940,8 +940,9 @@ def test_vectorize_exclude_dims_dask():
     assert_identical(expected, actual)
 
 
-with raises_regex(TypeError, "Only xr.DataArray is supported"):
-    xr.corr(xr.Dataset(), xr.Dataset())
+def test_corr_only_dataarray():
+    with pytest.raises(TypeError, match="Only xr.DataArray is supported"):
+        xr.corr(xr.Dataset(), xr.Dataset())
 
 
 def arrays_w_tuples():
@@ -984,12 +985,14 @@ def np_cov_ind(ts1, ts2, a, x):
             ts1, ts2 = broadcast(ts1, ts2)
             valid_values = ts1.notnull() & ts2.notnull()
 
+            # While dropping isn't ideal here, numpy will return nan
+            # if any segment contains a NaN.
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.cov(
-                ts1.sel(a=a, x=x).data.flatten(),
-                ts2.sel(a=a, x=x).data.flatten(),
+            return np.ma.cov(
+                np.ma.masked_invalid(ts1.sel(a=a, x=x).data.flatten()),
+                np.ma.masked_invalid(ts2.sel(a=a, x=x).data.flatten()),
                 ddof=ddof,
             )[0, 1]
 
@@ -1010,7 +1013,11 @@ def np_cov(ts1, ts2):
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.cov(ts1.data.flatten(), ts2.data.flatten(), ddof=ddof)[0, 1]
+            return np.ma.cov(
+                np.ma.masked_invalid(ts1.data.flatten()),
+                np.ma.masked_invalid(ts2.data.flatten()),
+                ddof=ddof,
+            )[0, 1]
 
         expected = np_cov(da_a, da_b)
         actual = xr.cov(da_a, da_b, dim=dim, ddof=ddof)
@@ -1033,8 +1040,9 @@ def np_corr_ind(ts1, ts2, a, x):
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.corrcoef(
-                ts1.sel(a=a, x=x).data.flatten(), ts2.sel(a=a, x=x).data.flatten()
+            return np.ma.corrcoef(
+                np.ma.masked_invalid(ts1.sel(a=a, x=x).data.flatten()),
+                np.ma.masked_invalid(ts2.sel(a=a, x=x).data.flatten()),
             )[0, 1]
 
         expected = np.zeros((3, 4))
@@ -1054,7 +1062,10 @@ def np_corr(ts1, ts2):
             ts1 = ts1.where(valid_values)
             ts2 = ts2.where(valid_values)
 
-            return np.corrcoef(ts1.data.flatten(), ts2.data.flatten())[0, 1]
+            return np.ma.corrcoef(
+                np.ma.masked_invalid(ts1.data.flatten()),
+                np.ma.masked_invalid(ts2.data.flatten()),
+            )[0, 1]
 
         expected = np_corr(da_a, da_b)
         actual = xr.corr(da_a, da_b, dim)
@@ -1084,13 +1095,14 @@ def test_covcorr_consistency(da_a, da_b, dim):
 @pytest.mark.parametrize(
     "da_a",
     arrays_w_tuples()[0],
 )
-@pytest.mark.parametrize("dim", [None, "time", "x"])
+@pytest.mark.parametrize("dim", [None, "time", "x", ["time", "x"]])
 def test_autocov(da_a, dim):
     # Testing that the autocovariance*(N-1) is ~=~ to the variance matrix
     # 1. Ignore the nans
     valid_values = da_a.notnull()
-    da_a = da_a.where(valid_values)
-    expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=False)
+    # Because we're using ddof=1, this requires > 1 value in each sample
+    da_a = da_a.where(valid_values.sum(dim=dim) > 1)
+    expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=True, min_count=1)
     actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1)
     assert_allclose(actual, expected)
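
Illustrative note (not part of the patch): a minimal sketch of the behaviour this change enables, assuming a build of xarray that includes it. With the NaN-aware sum (``skipna=True, min_count=1``) in ``_cov_corr``, observation pairs containing a missing value are skipped instead of propagating NaN into the result, mirroring the ``np.ma`` reference used in the updated tests. The example arrays below are made up for illustration.

import numpy as np
import xarray as xr

# Two aligned series; one observation is missing in da_b.
da_a = xr.DataArray([1.0, 2.0, 3.0, 4.0], dims="time")
da_b = xr.DataArray([1.0, np.nan, 2.5, 5.0], dims="time")

# With the fix, the NaN pair is ignored and both results are finite.
print(xr.cov(da_a, da_b, dim="time", ddof=1).item())
print(xr.corr(da_a, da_b, dim="time").item())

# Reference computation with numpy masked arrays, as in the updated tests:
# masked entries are excluded pairwise before computing the statistics.
ts1 = np.ma.masked_invalid(da_a.values)
ts2 = np.ma.masked_invalid(da_b.values)
print(np.ma.cov(ts1, ts2, ddof=1)[0, 1])
print(np.ma.corrcoef(ts1, ts2)[0, 1])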