Skip to content

Commit

Permalink
remove ensemble anomaly type 'first' (#247)
Browse files Browse the repository at this point in the history
* remove ensemble anomaly type 'first'

* CHANGELOG
  • Loading branch information
mathause authored Jan 22, 2023
1 parent f5b36fb commit f158f94
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 57 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@ Breaking changes
and the ``reg_dict`` argument to :py:func:`mesmer.utils.select.extract_land`. These arguments
no longer have any effect (`#235 <https://github.com/MESMER-group/mesmer/pull/235>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

- Removed ``ref["type"] == "first"``, i.e., calculating the anomaly w.r.t. the first
ensemble member (`#247 <https://github.com/MESMER-group/mesmer/pull/247>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

Deprecations
^^^^^^^^^^^^
Expand Down
64 changes: 19 additions & 45 deletions mesmer/io/_load_cmipng.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,53 +152,33 @@ def find_files_cmipng(gen, esm, var, scenario, dir_cmipng):
runs_excl = []
print("TO DO: create list of excluded runs / ESMs for this variable")

# print(
# "TO DO: rewrite selection of p2 in way that needs less space for CanESM5 (+ see if other ESMs need p2 too)"
# TODO: how to handle p2?
# )
path = os.path.join(dir_name, f"{var}_ann_{esm}_{scenario}_r*i1p1f*_g025.nc")
path_runs_list_scen = sorted(glob.glob(path))

path = os.path.join(dir_name, f"{var}_ann_{esm}_historical_r*i1p1f*_g025.nc")
path_runs_list_hist = sorted(glob.glob(path))

# check if both scenario and historical run are available for this realization, if yes add to path_runs_list
# check if both scenario and historical run are available for this realization,
# if yes add to path_runs_list
path_runs_list = []
for run in path_runs_list_scen:
run_hist = run.replace(scenario, "historical")
if run_hist in path_runs_list_hist:
path_runs_list.append(run)

# TODO: redecide if I am fine with CanESM5 p2 but not all scenarios or if I prefer the worse p1 which has all scenarios
# code below = old version when used p2 instead
# if esm != "CanESM5":
# path_runs_list = sorted(
# glob.glob(
# dir_name
# + var
# + "_ann_"
# + esm
# + "_"
# + scenario
# + "_"
# + "r*i1p1f*"
# + "_g025.nc"
# )
# )

# TODO: redecide if I am fine with CanESM5 p2 but not all scenarios or if I
# prefer the worse p1 which has all scenarios code below = old version when used
# p2 instead
# if esm == "CanESM5":
# variant = "r*i1p2f*"
# else:
# path_runs_list = sorted(
# glob.glob(
# dir_name
# + var
# + "_ann_"
# + esm
# + "_"
# + scenario
# + "_"
# + "r*i1p2f*"
# + "_g025.nc"
# )
# )
# variant = "r*i1p1f*"

# path = os.path.join(dir_name, f"{var}_ann_{esm}_{scenario}_{variant}_g025.nc")
# path_runs_list = sorted(glob.glob(path))

if len(path_runs_list) == 0: # if no entries found, return the empty list
return path_runs_list

Expand Down Expand Up @@ -266,8 +246,9 @@ def load_cmipng(targ, esm, scen, cfg):
"No version without historical time period is currently implemented."
)

# once start working with other vars, extend dict eg {"pr": load_cmipng_pr,
# "hfds": load_cmipng_hfds, "tas": load_cmipng_tas}
targ_func_mapping = {"hfds": load_cmipng_hfds, "tas": load_cmipng_tas}
# once start working with other vars, extend dict eg {"pr": load_cmipng_pr, "hfds": load_cmipng_hfds, "tas": load_cmipng_tas}

load_targ = targ_func_mapping[targ]

Expand Down Expand Up @@ -420,6 +401,9 @@ def _load_cmipng_var(esm, scen, cfg, varn):
ref = cfg.ref
dir_cmipng = cfg.dir_cmipng

if ref["type"] == "first":
raise ValueError("reference type 'first' is no longer supported")

# find the files which fulfill the specifications
path_runs_list = find_files_cmipng(gen, esm, varn, scen, dir_cmipng)

Expand Down Expand Up @@ -468,17 +452,7 @@ def _load_cmipng_var(esm, scen, cfg, varn):
)
dta[run] -= dta_ref # compute anomalies

# TODO: remove "first" (used for Leas first paper but does not really make sense)
if ref["type"] == "first" and run == "1":
# TO CHECK
dta_ref = (
data[varn]
.sel(time=slice(ref["start"], ref["end"]))
.mean(dim="time")
.values
)

if ref["type"] == "all" or ref["type"] == "first":
if ref["type"] == "all":
for run_path in path_runs_list:
run = run_nrs[run_path]
dta[run] -= dta_ref # compute anomalies
Expand Down
85 changes: 74 additions & 11 deletions tests/integration/test_load_cmipng.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os

import numpy as np
import numpy.testing as npt
import pytest

import mesmer
Expand All @@ -11,7 +10,6 @@

def _get_default_kwargs(
test_data_root_dir,
varn=None,
esm="IPSL-CM6A-LR",
scen="ssp126",
type="all",
Expand Down Expand Up @@ -50,6 +48,14 @@ def test_load_cmipng_var_missing_data():
assert time is None


def test_load_cmipng_var_type_first_error(test_data_root_dir):

with pytest.raises(ValueError, match="reference type 'first'"):
_load_cmipng_var(
varn="tas", **_get_default_kwargs(test_data_root_dir, type="first")
)


@pytest.mark.parametrize("varn", ["tas", "hfds"])
def test_load_cmipng(test_data_root_dir, varn):

Expand All @@ -68,16 +74,73 @@ def test_load_cmipng(test_data_root_dir, varn):
assert dta[1].shape == (251, 20, 20)
assert dta_global[1].shape == (251,)

npt.assert_allclose(lon["c"], lon_expected)
npt.assert_allclose(lat["c"], lat_expected)
npt.assert_allclose(time, time_expected)
np.testing.assert_allclose(lon["c"], lon_expected)
np.testing.assert_allclose(lat["c"], lat_expected)
np.testing.assert_allclose(time, time_expected)


@pytest.mark.parametrize("varn", ["tas", "hfds"])
@pytest.mark.parametrize("start, end", [("1850", "1900"), ("1850", "1860")])
def test_load_cmipng_ref(test_data_root_dir, varn, start, end):
def test_load_cmipng_ref_two_ens_all(test_data_root_dir, varn, start, end):

dta, dta_global, lon, lat, time = _load_cmipng_var(
varn=varn,
**_get_default_kwargs(test_data_root_dir, scen="ssp585", start=start, end=end),
)
time = np.asarray(time)
sel = (time >= int(start)) & (time <= int(end))

# stack ensemble members, new axis at end
dta_arr = np.stack(list(dta.values()), axis=-1)
dta_global_arr = np.stack(list(dta_global.values()), axis=-1)

# test mean over all grid points is 0 (over reference period AND ensembles)
np.testing.assert_allclose(0, np.nanmean(dta_arr[sel, :, :]), atol=1e-6)

# test individual gridpoints are 0 (over reference period)
result = np.mean(dta_arr[sel, :, :], axis=(0, -1))

print(result.shape)
expected = np.zeros_like(result)
expected[np.isnan(result)] = np.NaN
np.testing.assert_allclose(expected, result, atol=1e-6)

# test global mean is 0 (over reference period)
np.testing.assert_allclose(0, np.mean(dta_global_arr[sel]), atol=1e-6)


# TODO: test ref["type"] once there is more than one test dataset
@pytest.mark.parametrize("varn", ["tas", "hfds"])
@pytest.mark.parametrize("start, end", [("1850", "1900"), ("1850", "1860")])
def test_load_cmipng_ref_two_ens_indiv(test_data_root_dir, varn, start, end):

dta, dta_global, lon, lat, time = _load_cmipng_var(
varn=varn,
**_get_default_kwargs(
test_data_root_dir, scen="ssp585", type="individ", start=start, end=end
),
)
time = np.asarray(time)
sel = (time >= int(start)) & (time <= int(end))

# test mean over all grid points is 0 (over reference period)
for arr in dta.values():
np.testing.assert_allclose(0, np.nanmean(arr[sel, :, :]), atol=1e-6)

# test individual gridpoints are 0 (over reference period)
for arr in dta.values():
result = np.mean(arr[sel, :, :], axis=0)
expected = np.zeros_like(result)
expected[np.isnan(result)] = np.NaN
np.testing.assert_allclose(expected, result, atol=1e-6)

# test global mean is 0 (over reference period)
for arr_global in dta_global.values():
np.testing.assert_allclose(0, np.mean(arr_global[sel]), atol=1e-6)


@pytest.mark.parametrize("varn", ["tas", "hfds"])
@pytest.mark.parametrize("start, end", [("1850", "1900"), ("1850", "1860")])
def test_load_cmipng_ref(test_data_root_dir, varn, start, end):

dta, dta_global, lon, lat, time = _load_cmipng_var(
varn=varn, **_get_default_kwargs(test_data_root_dir, start=start, end=end)
Expand All @@ -86,16 +149,16 @@ def test_load_cmipng_ref(test_data_root_dir, varn, start, end):
sel = (time >= int(start)) & (time <= int(end))

# test mean over all grid points is 0 (over reference period)
npt.assert_allclose(0, np.nanmean(dta[1][sel, :, :]), atol=1e-6)
np.testing.assert_allclose(0, np.nanmean(dta[1][sel, :, :]), atol=1e-6)

# test individual gridpoints are 0 (over reference period)
result = np.mean(dta[1][sel, :, :], axis=0)
expected = np.zeros_like(result)
expected[np.isnan(result)] = np.NaN
npt.assert_allclose(expected, result, atol=1e-6)
np.testing.assert_allclose(expected, result, atol=1e-6)

# test global mean is 0 (over reference period)
npt.assert_allclose(0, np.mean(dta_global[1][sel]), atol=1e-6)
np.testing.assert_allclose(0, np.mean(dta_global[1][sel]), atol=1e-6)


@pytest.mark.parametrize("varn", ["tas", "hfds"])
Expand All @@ -118,4 +181,4 @@ def test_load_cmimpng_vs_load_var(test_data_root_dir, varn):
assert_dict_allclose(dta_global_d, dta_global_i, "direct", "indirect")
assert_dict_allclose(lon_d, lon_i, "direct", "indirect")
assert_dict_allclose(lat_d, lat_i, "direct", "indirect")
npt.assert_allclose(time_d, time_i)
np.testing.assert_allclose(time_d, time_i)

0 comments on commit f158f94

Please sign in to comment.