From 176fcdf710b0431acfde7e92b8d65469deb242e9 Mon Sep 17 00:00:00 2001
From: Victoria <112418493+veni-vidi-vici-dormivi@users.noreply.github.com>
Date: Sat, 25 Jan 2025 14:11:39 +0100
Subject: [PATCH] Test `draw_auto_regression_*` with `DataTree` of seeds (#591)

* extend draw_autoregression_monthly to handle datatree of seeds

* test drawing with multiple seeds

* test for seed as dataset to draw_autoregression_monthly

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Mathias Hauser <mathause@users.noreply.github.com>
---
 mesmer/stats/_auto_regression.py   |  11 +-
 tests/unit/test_auto_regression.py | 220 +++++++++++++++++++++++++----
 2 files changed, 201 insertions(+), 30 deletions(-)

diff --git a/mesmer/stats/_auto_regression.py b/mesmer/stats/_auto_regression.py
index 8d1352ac..5af94bda 100644
--- a/mesmer/stats/_auto_regression.py
+++ b/mesmer/stats/_auto_regression.py
@@ -919,8 +919,10 @@ def draw_auto_regression_monthly(
         that will be the assigned time dimension of the predictions.
     n_realisations : int
         The number of realisations to draw.
-    seed : int
-        Seed used to initialize the pseudo-random number generator.
+    seed : int | xr.Dataset
+        Seed used to initialize the pseudo-random number generator. Can be an int or a xr.Dataset that
+        contains a single variable "seed" with the seed value, used if this function is mapped over
+        a DataTree to draw samples for multiple scenarios.
     buffer : int
         Buffer to initialize the autoregressive process (ensures that start at 0 does
         not influence overall result).
@@ -953,6 +955,9 @@ def draw_auto_regression_monthly(
         covariance, "covariance", ndim=3, shape=(n_months, size, size)
     )
 
+    if isinstance(seed, xr.Dataset):
+        seed = int(seed.seed.values)
+
     result = _draw_ar_corr_monthly_xr_internal(
         intercept=ar_params.intercept,
         slope=ar_params.slope,
@@ -965,7 +970,7 @@ def draw_auto_regression_monthly(
         realisation_dim=realisation_dim,
     )
 
-    return result
+    return result.rename("samples")
 
 
 def _draw_ar_corr_monthly_xr_internal(
diff --git a/tests/unit/test_auto_regression.py b/tests/unit/test_auto_regression.py
index d5227acb..12e482e3 100644
--- a/tests/unit/test_auto_regression.py
+++ b/tests/unit/test_auto_regression.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import pytest
 import xarray as xr
+from datatree import DataTree, map_over_subtree
 from packaging.version import Version
 
 import mesmer
@@ -146,30 +147,73 @@ def test_draw_auto_regression_uncorrelated_2D_errors(ar_params_2D):
 
 
 @pytest.mark.parametrize("time", (3, 5))
-@pytest.mark.parametrize("realization", (2, 4))
+@pytest.mark.parametrize("realisation", (2, 4))
 @pytest.mark.parametrize("time_dim", ("time", "ts"))
-@pytest.mark.parametrize("realization_dim", ("realization", "sample"))
+@pytest.mark.parametrize("realisation_dim", ("realisation", "sample"))
 @pytest.mark.parametrize("seed", [0, xr.Dataset({"seed": 0})])
 def test_draw_auto_regression_uncorrelated(
-    ar_params_1D, time, realization, time_dim, realization_dim, seed
+    ar_params_1D, time, realisation, time_dim, realisation_dim, seed
 ):
 
     result = mesmer.stats.draw_auto_regression_uncorrelated(
         ar_params_1D,
         time=time,
-        realisation=realization,
+        realisation=realisation,
         seed=seed,
         buffer=0,
         time_dim=time_dim,
-        realisation_dim=realization_dim,
+        realisation_dim=realisation_dim,
     )
 
     _check_dataarray_form(
         result,
         "result",
         ndim=2,
-        required_dims={time_dim, realization_dim},
-        shape=(time, realization),
+        required_dims={time_dim, realisation_dim},
+        shape=(time, realisation),
+    )
+
+
+def test_draw_auto_regression_uncorrelated_dt(ar_params_1D):
+    seeds = DataTree.from_dict(
+        {
+            "scen1": xr.DataArray(np.array([25]), name="seed"),
+            "scen2": xr.DataArray(np.array([42]), name="seed"),
+        }
+    )
+    n_realisation = 10
+    n_ts = 20
+
+    result = map_over_subtree(mesmer.stats.draw_auto_regression_uncorrelated)(
+        ar_params_1D,
+        time=n_ts,
+        realisation=n_realisation,
+        seed=seeds,
+        buffer=10,
+        time_dim="time",
+        realisation_dim="realisation",
+    )
+
+    assert result["scen1"].to_dataset().var() is not result["scen2"].to_dataset().var()
+    _check_dataset_form(
+        result["scen1"].to_dataset(), "result", required_vars={"samples"}
+    )
+    _check_dataset_form(
+        result["scen2"].to_dataset(), "result", required_vars={"samples"}
+    )
+    _check_dataarray_form(
+        result["scen1"].samples,
+        "samples",
+        ndim=2,
+        required_dims={"time", "realisation"},
+        shape=(n_ts, n_realisation),
+    )
+    _check_dataarray_form(
+        result["scen2"].samples,
+        "samples",
+        ndim=2,
+        required_dims={"time", "realisation"},
+        shape=(n_ts, n_realisation),
     )
 
 
@@ -250,23 +294,23 @@ def test_draw_auto_regression_correlated_wrong_input(ar_params_2D, covariance, d
 
 
 @pytest.mark.parametrize("time", (3, 5))
-@pytest.mark.parametrize("realization", (2, 4))
+@pytest.mark.parametrize("realisation", (2, 4))
 @pytest.mark.parametrize("time_dim", ("time", "ts"))
-@pytest.mark.parametrize("realization_dim", ("realization", "sample"))
+@pytest.mark.parametrize("realisation_dim", ("realisation", "sample"))
 @pytest.mark.parametrize("seed", [0, xr.Dataset({"seed": 0})])
 def test_draw_auto_regression_correlated(
-    ar_params_2D, covariance, time, realization, time_dim, realization_dim, seed
+    ar_params_2D, covariance, time, realisation, time_dim, realisation_dim, seed
 ):
 
     result = mesmer.stats.draw_auto_regression_correlated(
         ar_params_2D,
         covariance,
         time=time,
-        realisation=realization,
+        realisation=realisation,
         seed=seed,
         buffer=0,
         time_dim=time_dim,
-        realisation_dim=realization_dim,
+        realisation_dim=realisation_dim,
     )
 
     n_gridcells = ar_params_2D.intercept.size
@@ -275,8 +319,52 @@ def test_draw_auto_regression_correlated(
         result,
         "result",
         ndim=3,
-        required_dims={time_dim, "gridcell", realization_dim},
-        shape=(time, n_gridcells, realization),
+        required_dims={time_dim, "gridcell", realisation_dim},
+        shape=(time, n_gridcells, realisation),
+    )
+
+
+def test_draw_auto_regression_correlated_dt(ar_params_2D, covariance):
+    seeds = DataTree.from_dict(
+        {
+            "scen1": xr.DataArray(np.array([25]), name="seed"),
+            "scen2": xr.DataArray(np.array([42]), name="seed"),
+        }
+    )
+    n_realisation = 10
+    n_ts = 20
+
+    result = map_over_subtree(mesmer.stats.draw_auto_regression_correlated)(
+        ar_params_2D,
+        covariance,
+        time=n_ts,
+        realisation=n_realisation,
+        seed=seeds,
+        buffer=10,
+        time_dim="time",
+        realisation_dim="realisation",
+    )
+
+    assert result["scen1"].to_dataset().var() is not result["scen2"].to_dataset().var()
+    _check_dataset_form(
+        result["scen1"].to_dataset(), "result", required_vars={"samples"}
+    )
+    _check_dataset_form(
+        result["scen2"].to_dataset(), "result", required_vars={"samples"}
+    )
+    _check_dataarray_form(
+        result["scen1"].samples,
+        "samples",
+        ndim=3,
+        required_dims={"time", "gridcell", "realisation"},
+        shape=(n_ts, 2, n_realisation),
+    )
+    _check_dataarray_form(
+        result["scen2"].samples,
+        "samples",
+        ndim=3,
+        required_dims={"time", "gridcell", "realisation"},
+        shape=(n_ts, 2, n_realisation),
     )
 
 
@@ -604,8 +692,8 @@ def test_fit_autoregression_monthly_np_no_noise(slope, intercept):
     # test if autoregrerssion can fit using previous month as independent variable
     # and current month as dependent variable
     n_ts = 100
-    np.random.seed(0)
-    prev_month = np.random.normal(size=n_ts)
+    rng = np.random.default_rng(seed=0)
+    prev_month = rng.normal(size=n_ts)
     cur_month = prev_month * slope + intercept
 
     result = mesmer.stats._auto_regression._fit_auto_regression_monthly_np(
@@ -626,9 +714,9 @@ def test_fit_autoregression_monthly_np_with_noise(slope, intercept, std):
     # test if autoregrerssion can fit using previous month as independent variable
     # and current month as dependent variable
     n_ts = 5000
-    np.random.seed(0)
-    prev_month = np.random.normal(size=n_ts)
-    cur_month = prev_month * slope + intercept + np.random.normal(0, std, size=n_ts)
+    rng = np.random.default_rng(seed=0)
+    prev_month = rng.normal(size=n_ts)
+    cur_month = prev_month * slope + intercept + rng.normal(0, std, size=n_ts)
 
     result = mesmer.stats._auto_regression._fit_auto_regression_monthly_np(
         cur_month, prev_month
@@ -644,9 +732,10 @@ def test_fit_auto_regression_monthly():
     freq = "ME" if Version(pd.__version__) >= Version("2.2") else "M"
     n_years = 20
     n_gridcells = 10
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
+
     data = xr.DataArray(
-        np.random.normal(size=(n_years * 12, n_gridcells)),
+        rng.normal(size=(n_years * 12, n_gridcells)),
         dims=("time", "gridcell"),
         coords={
             "time": pd.date_range("2000-01-01", periods=n_years * 12, freq=freq),
@@ -684,7 +773,8 @@ def test_draw_auto_regression_monthly_np_buffer(buffer):
     n_realisations = 1
     n_gridcells = 10
     seed = 0
-    slope = np.random.uniform(-1, 1, size=(12, n_gridcells))
+    rng = np.random.default_rng(seed=seed)
+    slope = rng.uniform(-1, 1, size=(12, n_gridcells))
     intercept = np.ones((12, n_gridcells))
     covariance = np.zeros((12, n_gridcells, n_gridcells))
     n_ts = 120
@@ -734,22 +824,23 @@ def test_draw_autoregression_monthly_np_rng():
     assert np.not_equal(jan, feb).any()
 
 
-def test_draw_auto_regression_monthly():
+@pytest.mark.parametrize("seed", [0, xr.Dataset({"seed": 0})])
+def test_draw_auto_regression_monthly(seed):
     freq = "ME" if Version(pd.__version__) >= Version("2.2") else "M"
 
     n_gridcells = 10
     n_realisations = 5
     n_years = 10
-    seed = 0
     buffer = 10
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
+
     slopes = xr.DataArray(
-        np.random.normal(-0.99, 0.99, size=(12, n_gridcells)),
+        rng.normal(-0.99, 0.99, size=(12, n_gridcells)),
         dims=("month", "gridcell"),
         coords={"month": np.arange(1, 13), "gridcell": np.arange(n_gridcells)},
     )
     intercepts = xr.DataArray(
-        np.random.normal(-10, 10, size=(12, n_gridcells)),
+        rng.normal(-10, 10, size=(12, n_gridcells)),
         dims=("month", "gridcell"),
         coords={"month": np.arange(1, 13), "gridcell": np.arange(n_gridcells)},
     )
@@ -784,3 +875,78 @@ def test_draw_auto_regression_monthly():
         required_dims={"time", "gridcell", "realisation"},
         shape=(len(time), n_gridcells, n_realisations),
     )
+
+
+def test_draw_auto_regression_monthly_dt():
+
+    seeds = DataTree.from_dict(
+        {
+            "scen1": xr.DataArray(np.array([25]), name="seed"),
+            "scen2": xr.DataArray(np.array([42]), name="seed"),
+        }
+    )
+
+    freq = "ME" if Version(pd.__version__) >= Version("2.2") else "M"
+
+    n_gridcells = 10
+    n_realisation = 5
+    n_years = 10
+    buffer = 10
+
+    rng = np.random.default_rng(seed=0)
+
+    slopes = xr.DataArray(
+        rng.normal(-0.99, 0.99, size=(12, n_gridcells)),
+        dims=("month", "gridcell"),
+        coords={"month": np.arange(1, 13), "gridcell": np.arange(n_gridcells)},
+    )
+    intercepts = xr.DataArray(
+        rng.normal(-10, 10, size=(12, n_gridcells)),
+        dims=("month", "gridcell"),
+        coords={"month": np.arange(1, 13), "gridcell": np.arange(n_gridcells)},
+    )
+    ar_params = xr.Dataset({"intercept": intercepts, "slope": slopes})
+
+    covariance = xr.DataArray(
+        np.tile(np.eye(n_gridcells), [12, 1, 1]),
+        dims=("month", "gridcell_i", "gridcell_j"),
+        coords={
+            "month": np.arange(1, 13),
+            "gridcell_i": np.arange(n_gridcells),
+            "gridcell_j": np.arange(n_gridcells),
+        },
+    )
+
+    time = pd.date_range("2000-01-01", periods=n_years * 12, freq=freq)
+    time = xr.DataArray(time, dims="time", coords={"time": time})
+
+    result = map_over_subtree(mesmer.stats.draw_auto_regression_monthly)(
+        ar_params,
+        covariance,
+        time=time,
+        n_realisations=n_realisation,
+        seed=seeds,
+        buffer=buffer,
+    )
+
+    assert result["scen1"].to_dataset().var() is not result["scen2"].to_dataset().var()
+    _check_dataset_form(
+        result["scen1"].to_dataset(), "result", required_vars={"samples"}
+    )
+    _check_dataset_form(
+        result["scen2"].to_dataset(), "result", required_vars={"samples"}
+    )
+    _check_dataarray_form(
+        result["scen1"].samples,
+        "samples",
+        ndim=3,
+        required_dims={"time", "gridcell", "realisation"},
+        shape=(n_years * 12, n_gridcells, n_realisation),
+    )
+    _check_dataarray_form(
+        result["scen2"].samples,
+        "samples",
+        ndim=3,
+        required_dims={"time", "gridcell", "realisation"},
+        shape=(n_years * 12, n_gridcells, n_realisation),
+    )