Skip to content

Commit

Permalink
In-place refactor of train_lv.py (#167)
Browse files Browse the repository at this point in the history
* inplace refactor train_lv.py

* extract _adjust_ecov_ar1 to function

* linting
  • Loading branch information
mathause authored Jul 15, 2022
1 parent 4ea22aa commit 723eab5
Showing 1 changed file with 79 additions and 37 deletions.
116 changes: 79 additions & 37 deletions mesmer/calibrate_mesmer/train_lv.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,26 +250,65 @@ def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg):
params_lv["loc_ecov"][targ_name],
) = train_lv_find_localized_ecov(y[targ_name], wgt_scen_eq, aux, cfg)

# ATTENTION: STILL NEED TO CHECK IF THIS IS TRUE. I UNFORTUNATELY LEARNED THAT I
# WROTE THIS FORMULA DIFFERENTLY IN THE ESD PAPER!!!!!!! (But I am pretty sure
# that code is correct and the error is in the paper)
# compute localized cov matrix of the innovations of the AR(1) process
loc_ecov_AR1_innovs = np.zeros(params_lv["loc_ecov"][targ_name].shape)
for i in np.arange(nr_gps):
for j in np.arange(nr_gps):
loc_ecov_AR1_innovs[i, j] = (
np.sqrt(1 - params_lv["AR1_coef"][targ_name][i] ** 2)
* np.sqrt(1 - params_lv["AR1_coef"][targ_name][j] ** 2)
* params_lv["loc_ecov"][targ_name][i, j]
)
loc_ecov_AR1_innovs = _adjust_ecov_ar1(
params_lv["loc_ecov"][targ_name], params_lv["AR1_coef"][targ_name]
)

params_lv["loc_ecov_AR1_innovs"][targ_name] = loc_ecov_AR1_innovs
# derive the localized ecov of the innovations of the AR(1) process (ie the one
# I will later draw innovs from)

return params_lv


def _adjust_ecov_ar1(ecov, ar_coefs):
"""adjust localized empirical covariance matrix for autoregressive process of order 1
Parameters
----------
ecov : 2D np.array
Empirical covariance matrix.
ar_coefs : 1D np.array
The coefficients of the autoregressive process of order 1.
Returns
-------
adjusted_ecov : np.array
Adjusted empirical covariance matrix.
Notes
-----
- Adjusts ``ecov`` for an AR(1) process according to [1]_, eq (8).
- The formula is specific for an AR(1) process, see also https://github.com/MESMER-group/mesmer/pull/167#discussion_r912481495
- According to [2]_ "The multiplication with the ``reduction_factor`` scales the
empirical standard error under the assumption of an autoregressive process of
order 1 [3]_. This accounts for the fact that the variance of an autoregressive
process is larger than that of the driving white noise process."
- This formula is wrong in [1]_. However, it is correct in the code. See also [2]_
and [3]_.
.. [1] Beusch, L., Gudmundsson, L., and Seneviratne, S. I.: Emulating Earth system model
temperatures with MESMER: from global mean temperature trajectories to grid-point-
level realizations on land, Earth Syst. Dynam., 11, 139–159,
https://doi.org/10.5194/esd-11-139-2020, 2020.
.. [2] Humphrey, V. and Gudmundsson, L.: GRACE-REC: a reconstruction of climate-driven
water storage changes over the last century, Earth Syst. Sci. Data, 11, 1153–1170,
https://doi.org/10.5194/essd-11-1153-2019, 2019.
.. [3] Cressie, N. and Wikle, C. K.: Statistics for spatio-temporal data, John Wiley &
Sons, Hoboken, New Jersey, USA, 2011.
"""

reduction_factor = np.sqrt(1 - ar_coefs**2)
reduction_factor = np.atleast_2d(reduction_factor) # so it can be transposed

# equivalent to ``diag(reduction_factor) @ ecov @ diag(reduction_factor)``
return reduction_factor * reduction_factor.T * ecov


def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
"""
Find suitable localization radius for empirical covariance matrix and derive
Expand Down Expand Up @@ -306,31 +345,30 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
"""

return _find_localized_empirical_covariance(
y, wgt_scen_eq, aux["phi_gc"], cfg.max_iter_cv
)


def _find_localized_empirical_covariance(y, wgt_scen_eq, phi_gc, max_iter_cv):

# derive the indices for the cross validation
max_iter_cv = cfg.max_iter_cv
nr_samples = y.shape[0]
nr_samples, nr_gridpoints = y.shape
nr_it = np.min([nr_samples, max_iter_cv])
idx_cv_out = np.zeros([nr_it, nr_samples], dtype=bool)
for i in np.arange(nr_it):
for i in range(nr_it):
idx_cv_out[i, i::max_iter_cv] = True

# spatial cross-correlations with specified cross val folds
L_set = np.sort(list(aux["phi_gc"].keys())) # the Ls to loop through
L_set = sorted(phi_gc.keys()) # the localisation radii to loop through

llh_max = float("-inf")
llh_cv_sum = {}
idx_L = 0
L_sel = L_set[idx_L]
idx_break = False

while (idx_break is False) and (L_sel < L_set[-1]):
# experience tells: once stop selecting larger loc radii, will not start again
# better to stop once max is reached (to limit computational effort + amount of
# singular matrices)
L = L_set[idx_L]

for L in L_set:
llh_cv_sum[L] = 0

for it in np.arange(nr_it):
for it in range(nr_it):
# extract folds
y_est = y[~idx_cv_out[it]] # to estimate params
y_cv = y[idx_cv_out[it]] # to crossvalidate the estimate
Expand All @@ -339,10 +377,12 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):

# compute ecov and likelihood of out fold to be drawn from it
ecov = np.cov(y_est, rowvar=False, aweights=wgt_scen_eq_est)
loc_ecov = aux["phi_gc"][L] * ecov
# we want the mean of the res to be 0
mean_0 = np.zeros(aux["phi_gc"][L].shape[0])
loc_ecov = phi_gc[L] * ecov

# we want the mean of the normal distribution to be 0
mean_0 = np.zeros(nr_gridpoints)

# NOTE: 90 % of time is spent here - not much point optimizing the rest
llh_cv_each_sample = multivariate_normal.logpdf(
y_cv, mean=mean_0, cov=loc_ecov, allow_singular=True
)
Expand All @@ -354,24 +394,26 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
# each cv sample gets its own likelihood -> can sum them up for overall
# likelihood
# sum over all samples = wgt average * nr_samples
llh_cv_fold_sum = np.average(
llh_cv_each_sample, weights=wgt_scen_eq_cv
) * len(wgt_scen_eq_cv)
llh_cv_fold_sum = (
np.average(llh_cv_each_sample, weights=wgt_scen_eq_cv)
* wgt_scen_eq_cv.size
)

# add to full sum over all folds
llh_cv_sum[L] += llh_cv_fold_sum

idx_L += 1

# experience tells: once stop selecting larger localisation radii, will not
# start again. Better to stop once max is reached (to limit computational effort
# and amount of singular matrices).
if llh_cv_sum[L] > llh_max:
L_sel = L
llh_max = llh_cv_sum[L]
print("Newly selected L =", L_sel)
else:
print("Final selected L =", L_sel)
idx_break = True
break

ecov = np.cov(y, rowvar=False, aweights=wgt_scen_eq)
loc_ecov = aux["phi_gc"][L_sel] * ecov
loc_ecov = phi_gc[L_sel] * ecov

return L_sel, ecov, loc_ecov

0 comments on commit 723eab5

Please sign in to comment.