Skip to content

Commit

Permalink
In-place refactor of train_lv.py (#167)
Browse files Browse the repository at this point in the history
* inplace refactor train_lv.py

* extract _adjust_ecov_ar1 to function

* linting
  • Loading branch information
mathause authored Jul 15, 2022
1 parent 4ea22aa commit 723eab5
Showing 1 changed file with 79 additions and 37 deletions.
116 changes: 79 additions & 37 deletions mesmer/calibrate_mesmer/train_lv.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,26 +250,65 @@ def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg):
params_lv["loc_ecov"][targ_name],
) = train_lv_find_localized_ecov(y[targ_name], wgt_scen_eq, aux, cfg)

# ATTENTION: STILL NEED TO CHECK IF THIS IS TRUE. I UNFORTUNATELY LEARNED THAT I
# WROTE THIS FORMULA DIFFERENTLY IN THE ESD PAPER!!!!!!! (But I am pretty sure
# that code is correct and the error is in the paper)
# compute localized cov matrix of the innovations of the AR(1) process
loc_ecov_AR1_innovs = np.zeros(params_lv["loc_ecov"][targ_name].shape)
for i in np.arange(nr_gps):
for j in np.arange(nr_gps):
loc_ecov_AR1_innovs[i, j] = (
np.sqrt(1 - params_lv["AR1_coef"][targ_name][i] ** 2)
* np.sqrt(1 - params_lv["AR1_coef"][targ_name][j] ** 2)
* params_lv["loc_ecov"][targ_name][i, j]
)
loc_ecov_AR1_innovs = _adjust_ecov_ar1(
params_lv["loc_ecov"][targ_name], params_lv["AR1_coef"][targ_name]
)

params_lv["loc_ecov_AR1_innovs"][targ_name] = loc_ecov_AR1_innovs
# derive the localized ecov of the innovations of the AR(1) process (ie the one
# I will later draw innovs from)

return params_lv


def _adjust_ecov_ar1(ecov, ar_coefs):
"""adjust localized empirical covariance matrix for autoregressive process of order 1
Parameters
----------
ecov : 2D np.array
Empirical covariance matrix.
ar_coefs : 1D np.array
The coefficients of the autoregressive process of order 1.
Returns
-------
adjusted_ecov : np.array
Adjusted empirical covariance matrix.
Notes
-----
- Adjusts ``ecov`` for an AR(1) process according to [1]_, eq (8).
- The formula is specific for an AR(1) process, see also https://github.com/MESMER-group/mesmer/pull/167#discussion_r912481495
- According to [2]_ "The multiplication with the ``reduction_factor`` scales the
empirical standard error under the assumption of an autoregressive process of
order 1 [3]_. This accounts for the fact that the variance of an autoregressive
process is larger than that of the driving white noise process."
- This formula is wrong in [1]_. However, it is correct in the code. See also [2]_
and [3]_.
.. [1] Beusch, L., Gudmundsson, L., and Seneviratne, S. I.: Emulating Earth system model
temperatures with MESMER: from global mean temperature trajectories to grid-point-
level realizations on land, Earth Syst. Dynam., 11, 139–159,
https://doi.org/10.5194/esd-11-139-2020, 2020.
.. [2] Humphrey, V. and Gudmundsson, L.: GRACE-REC: a reconstruction of climate-driven
water storage changes over the last century, Earth Syst. Sci. Data, 11, 1153–1170,
https://doi.org/10.5194/essd-11-1153-2019, 2019.
.. [3] Cressie, N. and Wikle, C. K.: Statistics for spatio-temporal data, John Wiley &
Sons, Hoboken, New Jersey, USA, 2011.
"""

reduction_factor = np.sqrt(1 - ar_coefs**2)
reduction_factor = np.atleast_2d(reduction_factor) # so it can be transposed

# equivalent to ``diag(reduction_factor) @ ecov @ diag(reduction_factor)``
return reduction_factor * reduction_factor.T * ecov


def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
"""
Find suitable localization radius for empirical covariance matrix and derive
Expand Down Expand Up @@ -306,31 +345,30 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
"""

return _find_localized_empirical_covariance(
y, wgt_scen_eq, aux["phi_gc"], cfg.max_iter_cv
)


def _find_localized_empirical_covariance(y, wgt_scen_eq, phi_gc, max_iter_cv):

# derive the indices for the cross validation
max_iter_cv = cfg.max_iter_cv
nr_samples = y.shape[0]
nr_samples, nr_gridpoints = y.shape
nr_it = np.min([nr_samples, max_iter_cv])
idx_cv_out = np.zeros([nr_it, nr_samples], dtype=bool)
for i in np.arange(nr_it):
for i in range(nr_it):
idx_cv_out[i, i::max_iter_cv] = True

# spatial cross-correlations with specified cross val folds
L_set = np.sort(list(aux["phi_gc"].keys())) # the Ls to loop through
L_set = sorted(phi_gc.keys()) # the localisation radii to loop through

llh_max = float("-inf")
llh_cv_sum = {}
idx_L = 0
L_sel = L_set[idx_L]
idx_break = False

while (idx_break is False) and (L_sel < L_set[-1]):
# experience tells: once stop selecting larger loc radii, will not start again
# better to stop once max is reached (to limit computational effort + amount of
# singular matrices)
L = L_set[idx_L]

for L in L_set:
llh_cv_sum[L] = 0

for it in np.arange(nr_it):
for it in range(nr_it):
# extract folds
y_est = y[~idx_cv_out[it]] # to estimate params
y_cv = y[idx_cv_out[it]] # to crossvalidate the estimate
Expand All @@ -339,10 +377,12 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):

# compute ecov and likelihood of out fold to be drawn from it
ecov = np.cov(y_est, rowvar=False, aweights=wgt_scen_eq_est)
loc_ecov = aux["phi_gc"][L] * ecov
# we want the mean of the res to be 0
mean_0 = np.zeros(aux["phi_gc"][L].shape[0])
loc_ecov = phi_gc[L] * ecov

# we want the mean of the normal distribution to be 0
mean_0 = np.zeros(nr_gridpoints)

# NOTE: 90 % of time is spent here - not much point optimizing the rest
llh_cv_each_sample = multivariate_normal.logpdf(
y_cv, mean=mean_0, cov=loc_ecov, allow_singular=True
)
Expand All @@ -354,24 +394,26 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
# each cv sample gets its own likelihood -> can sum them up for overall
# likelihood
# sum over all samples = wgt average * nr_samples
llh_cv_fold_sum = np.average(
llh_cv_each_sample, weights=wgt_scen_eq_cv
) * len(wgt_scen_eq_cv)
llh_cv_fold_sum = (
np.average(llh_cv_each_sample, weights=wgt_scen_eq_cv)
* wgt_scen_eq_cv.size
)

# add to full sum over all folds
llh_cv_sum[L] += llh_cv_fold_sum

idx_L += 1

# experience tells: once stop selecting larger localisation radii, will not
# start again. Better to stop once max is reached (to limit computational effort
# and amount of singular matrices).
if llh_cv_sum[L] > llh_max:
L_sel = L
llh_max = llh_cv_sum[L]
print("Newly selected L =", L_sel)
else:
print("Final selected L =", L_sel)
idx_break = True
break

ecov = np.cov(y, rowvar=False, aweights=wgt_scen_eq)
loc_ecov = aux["phi_gc"][L_sel] * ecov
loc_ecov = phi_gc[L_sel] * ecov

return L_sel, ecov, loc_ecov

0 comments on commit 723eab5

Please sign in to comment.