cov() and corr() - finalization #3550

Closed. Wants to merge 42 commits.

Changes shown from 36 of 42 commits.
ec58dfa
added da.corr() and da.cov() to dataarray.py. Test added in test_data…
hrishikeshac Jan 4, 2019
ec1dd72
Made the code PEP8 compatible
hrishikeshac Jan 4, 2019
184e4ab
further merges and minor changes
Nov 20, 2019
ef1e25c
minor pep8 changes
Nov 20, 2019
fe3fe06
formatted using black
Nov 20, 2019
d57eb7c
added decorator '@requires_scipy_or_netCDF4'
Nov 20, 2019
13a3925
minor pep8 changes
Nov 20, 2019
e9a8869
added `@requires_scipy_or_netCDF4` to test_corr()
Nov 20, 2019
617fd43
added TODO tag to test_corr()
r-beer Nov 20, 2019
29b3e43
Update xarray/core/dataarray.py
r-beer Nov 20, 2019
27980c8
Update xarray/core/dataarray.py
r-beer Nov 20, 2019
5e4f002
Update xarray/core/dataarray.py
r-beer Nov 20, 2019
28b1229
Update xarray/core/dataarray.py
r-beer Nov 20, 2019
98a5f82
Update xarray/core/dataarray.py
r-beer Nov 20, 2019
ea8f6eb
Update xarray/core/dataarray.py
r-beer Nov 20, 2019
a4d0446
minor docstring correction
r-beer Nov 20, 2019
5a4b97c
modified test_corr():
r-beer Nov 20, 2019
1a6d939
added examples to corr and cov docstrings
r-beer Nov 20, 2019
93dfeeb
Merge branch 'corr' of https://github.com/r-beer/xarray into corr
r-beer Nov 20, 2019
9a04ae3
minor pep8 change
r-beer Nov 20, 2019
363e238
added documentation for da.corr and da.cov
r-beer Nov 20, 2019
48e4ad4
minor changes
r-beer Nov 20, 2019
ac46a43
moved cov and corr to computation
r-beer Nov 22, 2019
bc33c99
black and flake8 formatting
r-beer Nov 22, 2019
60831ff
fixed bug in da.cov and da.corr
r-beer Nov 22, 2019
3945ba1
add assert statement to np.allclose
r-beer Nov 23, 2019
bfe5b2b
minor fixes
r-beer Nov 23, 2019
24e9484
added test_cov()
r-beer Nov 23, 2019
8f2a118
Merge branch 'corr' of https://github.com/r-beer/xarray into corr
r-beer Nov 23, 2019
e09a607
removed da.corr and da.cov
r-beer Nov 23, 2019
2055962
manually deleted trailing whitespaces
r-beer Nov 23, 2019
a1063b5
added whitespace after comma manually
r-beer Nov 23, 2019
b694b5f
black reformatted
r-beer Nov 23, 2019
b54487e
minor improvements & documentation
r-beer Nov 23, 2019
5e7b32d
implemented cov calculation with N-1 normalization
r-beer Nov 23, 2019
bc0b100
black formatting
r-beer Nov 23, 2019
4ac0320
added Pearson to xr.corr() docstring
r-beer Nov 23, 2019
3a7120c
refactored test_cov and test_corr to test_func
r-beer Nov 23, 2019
ac00070
added ddof to xr.cov()
r-beer Nov 24, 2019
52ae54a
added dim parametrization to test_corr & test_cov
r-beer Nov 24, 2019
4f09263
black and flake8
r-beer Nov 24, 2019
4ab4af4
changed to assert_allclose and xr.DataArray
r-beer Nov 25, 2019
doc/api.rst: 2 changes (2 additions, 0 deletions)
@@ -30,6 +30,8 @@ Top-level functions
zeros_like
ones_like
dot
cov
corr
map_blocks

Dataset
doc/whats-new.rst: 2 changes (2 additions, 0 deletions)
@@ -25,6 +25,8 @@ Breaking changes

New Features
~~~~~~~~~~~~
- Added :py:func:`xarray.cov` and :py:func:`xarray.corr` (:pull:`3550`).
By `Robin Beer <https://github.com/r-beer>`_.


Bug fixes
xarray/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -10,7 +10,7 @@
from .core.common import full_like, zeros_like, ones_like
from .core.concat import concat
from .core.combine import combine_by_coords, combine_nested, auto_combine
-from .core.computation import apply_ufunc, dot, where
+from .core.computation import apply_ufunc, dot, where, cov, corr
from .core.extensions import register_dataarray_accessor, register_dataset_accessor
from .core.variable import as_variable, Variable, IndexVariable, Coordinate
from .core.dataset import Dataset
xarray/core/computation.py: 171 changes (169 additions, 2 deletions)
@@ -22,9 +22,8 @@
)

import numpy as np

from . import duck_array_ops, utils
-from .alignment import deep_align
+from .alignment import broadcast, deep_align
from .merge import merge_coordinates_without_align
from .pycompat import dask_array_type
from .utils import is_dict_like
@@ -1047,6 +1046,174 @@ def earth_mover_distance(first_samples,
return apply_array_ufunc(func, *args, dask=dask)


def cov(da_a, da_b, dim=None):
"""Compute covariance between two DataArray objects along a shared dimension.

Parameters
----------
da_a: DataArray (or Variable) object
Array over which to compute the covariance.
da_b: DataArray (or Variable) object
Array over which to compute the covariance.
dim : str, optional
The dimension along which the covariance will be computed

Returns
-------
covariance: DataArray

See also
--------
pandas.Series.cov: corresponding pandas function
xr.corr: corresponding function to compute correlation

Examples
--------

>>> da_a = DataArray(np.random.random((3, 5)),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
>>> da_a
<xarray.DataArray (space: 3, time: 5)>
array([[0.04356841, 0.11479286, 0.70359101, 0.59072561, 0.16601438],
[0.81552383, 0.72304926, 0.77644406, 0.05788198, 0.74065536],
[0.96252519, 0.36877741, 0.22248412, 0.55185954, 0.23547536]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 ... 2000-01-05

>>> da_b = DataArray(np.random.random((3, 5)),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
>>> da_b
<xarray.DataArray (space: 3, time: 5)>
array([[0.41505599, 0.43002193, 0.45250454, 0.57701084, 0.5327754 ],
[0.0998048 , 0.67225522, 0.4234324 , 0.13514615, 0.4399088 ],
[0.24675048, 0.58555283, 0.1942955 , 0.86128908, 0.05068975]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 ... 2000-01-05
>>> xr.cov(da_a, da_b)
<xarray.DataArray ()>
array(0.03823054)
>>> xr.cov(da_a, da_b, dim='time')
<xarray.DataArray (space: 3)>
array([0.00207952, 0.01024296, 0.08214707])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
"""

# 1. Broadcast the two arrays
da_a, da_b = broadcast(da_a, da_b)

# 2. Ignore the nans
valid_values = da_a.notnull() & da_b.notnull()
da_a = da_a.where(
valid_values, drop=True
) # TODO: avoid drop as explained in https://github.com/pydata/xarray/pull/2652#discussion_r245492002
Collaborator comment:
👍

Author comment:
This is fine as preparation for a future PR, right? If there is an easy way to drop the `drop`, I can implement it, of course!

Member comment:
I don't think drop=True is needed here at all, and I think it actually introduces a bug. In particular, consider the following case:

>>> array = xarray.DataArray([[1, 2], [np.nan, np.nan]], dims=['x', 'y'])
>>> xarray.corr(array, array, dim='y')
<xarray.DataArray (x: 1)>
array([2.])
Dimensions without coordinates: x

This doesn't look right to me at all. It has the wrong shape (size 1 instead of size 2), and somehow the correlation is greater than 1.

da_b = da_b.where(valid_values, drop=True)
valid_count = (
valid_values.sum(dim) - 1
) # as in pandas.Series.cov, default to unbiased "N - 1" normalization
# TODO: add parameter "bias" to decide whether or not N-1 normalization should be used

# 3. Compute mean and standard deviation along the given dim
demeaned_da_a = da_a - da_a.mean(dim=dim)
demeaned_da_b = da_b - da_b.mean(dim=dim)
Contributor comment:
Why not demean in place, and avoid additional copies of the full arrays? Aren't da_a and da_b already copies?


# 4. Compute covariance along the given dim
cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim) / (valid_count)

return cov
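
As an aside on the TODO above and the member comment about drop=True: below is a minimal sketch, not part of this PR's diff, of how the masking could work without drop=True, keeping the original shape and relying on xarray's NaN-skipping reductions. The toy arrays are made up for illustration.

import numpy as np
import xarray as xr

da_a = xr.DataArray([[1.0, 2.0, 3.0, 4.0], [np.nan, 5.0, 6.0, 7.0]],
                    dims=["x", "time"])
da_b = xr.DataArray([[2.0, 4.0, 6.0, 8.0], [1.0, 2.0, np.nan, 4.0]],
                    dims=["x", "time"])

valid_values = da_a.notnull() & da_b.notnull()
# Mask invalid entries with NaN but keep the original shape (no drop=True);
# mean() and sum() skip NaN by default, so the result keeps its full size.
da_a_masked = da_a.where(valid_values)
da_b_masked = da_b.where(valid_values)
valid_count = valid_values.sum("time") - 1

demeaned_a = da_a_masked - da_a_masked.mean("time")
demeaned_b = da_b_masked - da_b_masked.mean("time")
cov = (demeaned_a * demeaned_b).sum("time") / valid_count
# Each entry matches pandas.Series.cov on that row's pairwise-complete values.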


def corr(da_a, da_b, dim=None):
"""Compute correlation between two DataArray objects along a shared dimension.

Parameters
----------
da_a: DataArray (or Variable) object
Array over which to compute the correlation.
da_b: DataArray (or Variable) object
Array over which to compute the correlation.
dim: str, optional
The dimension along which the correlation will be computed

Returns
-------
correlation: DataArray

See also
--------
pandas.Series.corr: corresponding pandas function
xr.cov: underlying covariance function

Examples
--------

>>> da_a = DataArray(np.random.random((3, 5)),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
>>> da_a
<xarray.DataArray (space: 3, time: 5)>
array([[0.04356841, 0.11479286, 0.70359101, 0.59072561, 0.16601438],
[0.81552383, 0.72304926, 0.77644406, 0.05788198, 0.74065536],
[0.96252519, 0.36877741, 0.22248412, 0.55185954, 0.23547536]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 ... 2000-01-05

>>> da_b = DataArray(np.random.random((3, 5)),
... dims=("space", "time"),
... coords=[('space', ['IA', 'IL', 'IN']),
... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
>>> da_b
<xarray.DataArray (space: 3, time: 5)>
array([[0.41505599, 0.43002193, 0.45250454, 0.57701084, 0.5327754 ],
[0.0998048 , 0.67225522, 0.4234324 , 0.13514615, 0.4399088 ],
[0.24675048, 0.58555283, 0.1942955 , 0.86128908, 0.05068975]])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 ... 2000-01-05
>>> xr.corr(da_a, da_b)
<xarray.DataArray ()>
array(0.67407116)
>>> xr.corr(da_a, da_b, dim='time')
<xarray.DataArray (space: 3)>
array([0.23150267, 0.24900968, 0.9061562 ])
Coordinates:
* space (space) <U2 'IA' 'IL' 'IN'
"""
from .dataarray import DataArray

if any(not isinstance(arr, (Variable, DataArray)) for arr in [da_a, da_b]):
raise TypeError(
"Only xr.DataArray and xr.Variable are supported."
"Given {}.".format([type(arr) for arr in [da_a, da_b]])
)

# 1. Broadcast the two arrays
da_a, da_b = broadcast(da_a, da_b)

# 2. Ignore the nans
valid_values = da_a.notnull() & da_b.notnull()
da_a = da_a.where(valid_values, drop=True)
da_b = da_b.where(
valid_values, drop=True
) # TODO: avoid drop as explained in https://github.com/pydata/xarray/pull/2652#discussion_r245492002

# 3. Compute correlation based on standard deviations and cov()
da_a_std = da_a.std(dim=dim)
da_b_std = da_b.std(dim=dim)

corr = cov(da_a, da_b, dim=dim) / (da_a_std * da_b_std)

return corr
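
A normalization note, independent of this PR: the Pearson formula cov / (std_a * std_b) only reproduces the textbook correlation coefficient when cov and std use the same ddof. The cov() above divides by N - 1, while DataArray.std() defaults to ddof=0, so the two normalizations need to be kept consistent. A quick NumPy sanity check of the matched-ddof identity:

import numpy as np

rng = np.random.default_rng(42)
a = rng.random(20)
b = 0.5 * a + rng.random(20)

# With matching ddof the normalization factors cancel, and covariance divided
# by the product of standard deviations equals the Pearson correlation.
r = np.cov(a, b, ddof=1)[0, 1] / (a.std(ddof=1) * b.std(ddof=1))
np.testing.assert_allclose(r, np.corrcoef(a, b)[0, 1])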


def dot(*arrays, dims=None, **kwargs):
"""Generalized dot product for xarray objects. Like np.einsum, but
provides a simpler interface based on array dimensions.
xarray/core/dataarray.py: 1 change (0 additions, 1 deletion)
@@ -2834,7 +2834,6 @@ def dot(
>>> da = DataArray(da_vals, dims=['x', 'y', 'z'])
>>> dm_vals = np.arange(4)
>>> dm = DataArray(dm_vals, dims=['z'])

>>> dm.dims
('z')
>>> da.dims
xarray/tests/test_backends.py: 1 change (1 addition, 0 deletions)
@@ -2439,6 +2439,7 @@ def skip_if_not_engine(engine):
pytest.importorskip(engine)


@requires_scipy_or_netCDF4
@requires_dask
@pytest.mark.filterwarnings("ignore:use make_scale(name) instead")
def test_open_mfdataset_manyfiles(
xarray/tests/test_computation.py: 100 changes (100 additions, 0 deletions)
@@ -23,6 +23,28 @@
from . import has_dask, raises_regex, requires_dask


@pytest.fixture(params=[1])
def da(request):
if request.param == 1:
times = pd.date_range("2000-01-01", freq="1D", periods=21)
values = np.random.random((3, 21, 4))
da = xr.DataArray(values, dims=("a", "time", "x"))
da["time"] = times
return da

if request.param == 2:
return xr.DataArray(
[0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time"
)

if request.param == "repeating_ints":
return xr.DataArray(
np.tile(np.arange(12), 5).reshape(5, 4, 3),
coords={"x": list("abc"), "y": list("defg")},
dims=list("zyx"),
)
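
Since the fixture defaults to params=[1], the NaN-series and repeating-ints variants only run when a test opts in. A hypothetical example, not part of this diff, of requesting other variants through pytest's indirect parametrization:

@pytest.mark.parametrize("da", [1, 2], indirect=True)
def test_da_variants(da):
    # Runs once with the 3-D random array and once with the 1-D NaN series;
    # both fixture variants carry a "time" dimension.
    assert "time" in da.dims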


def assert_identical(a, b):
if hasattr(a, "identical"):
msg = f"not identical:\n{a!r}\n{b!r}"
@@ -789,6 +811,84 @@ def func(x):
assert_identical(expected, actual)


def test_corr(da):

# other: select misaligned data and smooth it to dampen the correlation with self.
da_smooth = da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(dim="time")

da = da.isel(time=range(0, 18))

def select_pts(array):
return array.sel(a=1, x=2)

# Test #1: Misaligned 1-D dataarrays with missing values
ts1 = select_pts(da.copy())
ts2 = select_pts(da_smooth.copy())

def pd_corr(ts1, ts2):
"""Ensure the ts are aligned and missing values ignored"""
ts1, ts2 = xr.align(ts1, ts2)
valid_values = ts1.notnull() & ts2.notnull()

ts1 = ts1.where(valid_values, drop=True)
ts2 = ts2.where(valid_values, drop=True)
Collaborator comment:
I would have thought that we should be testing that we get a similar result from pandas' cov and xarray's cov without much preparation. Can you help me understand why we need to drop values here? What's the test case that fails when we don't have these lines?


return ts1.to_series().corr(ts2.to_series())

expected = pd_corr(ts1, ts2)
actual = xr.corr(ts1, ts2)
assert np.allclose(expected, actual)

# Test #2: Misaligned N-D dataarrays with missing values
actual_ND = xr.corr(da, da_smooth, dim="time")
actual = select_pts(actual_ND)
assert np.allclose(expected, actual)

# Test #3: One 1-D dataarray and another N-D dataarray; misaligned and having missing values
actual_ND = xr.corr(da_smooth, ts1, dim="time")
actual = select_pts(actual_ND)
assert np.allclose(expected, actual)


def test_cov(da):

# other: select misaligned data and smooth it to dampen the correlation with self.
da_smooth = da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(dim="time")

da = da.isel(time=range(0, 18))

def select_pts(array):
return array.sel(a=1, x=2)

# Test #1: Misaligned 1-D dataarrays with missing values
ts1 = select_pts(da.copy())
ts2 = select_pts(da_smooth.copy())

def pd_cov(ts1, ts2):
"""Ensure the ts are aligned and missing values ignored"""
ts1, ts2 = xr.align(ts1, ts2)
valid_values = ts1.notnull() & ts2.notnull()

ts1 = ts1.where(valid_values, drop=True)
ts2 = ts2.where(valid_values, drop=True)

return ts1.to_series().cov(ts2.to_series())

expected = pd_cov(ts1, ts2)
actual = xr.cov(ts1, ts2)
assert np.allclose(expected, actual)

# Test #2: Misaligned N-D dataarrays with missing values
actual_ND = xr.cov(da, da_smooth, dim="time")
actual = select_pts(actual_ND)
assert np.allclose(expected, actual)

# Test #3: One 1-D dataarray and another N-D dataarray; misaligned and having missing values
actual_ND = xr.cov(ts1, da_smooth, dim="time")
actual = select_pts(actual_ND)
assert np.allclose(expected, actual)
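
test_corr and test_cov above are near-duplicates, and commit 3a7120c ("refactored test_cov and test_corr to test_func") indicates they were later merged; that commit falls outside this 36-commit view, so the following is only a sketch of what such a parametrization could look like, not the PR's actual refactor:

@pytest.mark.parametrize("func", ["cov", "corr"])
def test_func(da, func):
    # Smooth a misaligned copy to dampen the correlation with the original.
    da_smooth = da.isel(time=range(2, 20)).rolling(time=3, center=True).mean()
    da = da.isel(time=range(0, 18))

    ts1 = da.isel(a=1, x=2)
    ts2 = da_smooth.isel(a=1, x=2)

    # Align, mask shared NaNs, and compare against the pandas equivalent.
    s1, s2 = xr.align(ts1, ts2)
    valid = s1.notnull() & s2.notnull()
    s1 = s1.where(valid, drop=True).to_series()
    s2 = s2.where(valid, drop=True).to_series()

    expected = getattr(s1, func)(s2)      # pandas.Series.cov / .corr
    actual = getattr(xr, func)(ts1, ts2)  # xr.cov / xr.corr
    assert np.allclose(expected, actual)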


def pandas_median(x):
return pd.Series(x).median()
