From 9a6020c7ddd52b3dcf131043ad6adb297ae8c0c5 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Fri, 1 Jul 2022 11:12:06 +0200 Subject: [PATCH 1/9] extract select_ar_order --- mesmer/calibrate_mesmer/train_gv.py | 41 ++++++----------- mesmer/core/auto_regression.py | 70 +++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 27 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_gv.py b/mesmer/calibrate_mesmer/train_gv.py index 65235349..4d182403 100644 --- a/mesmer/calibrate_mesmer/train_gv.py +++ b/mesmer/calibrate_mesmer/train_gv.py @@ -13,10 +13,8 @@ import numpy as np import statsmodels.api as sm import xarray as xr -from packaging.version import Version -from statsmodels.tsa.ar_model import ar_select_order -from mesmer.core.auto_regression import _fit_auto_regression_xr +from mesmer.core.auto_regression import _fit_auto_regression_xr, _select_ar_order_xr def train_gv(gv, targ, esm, cfg, save_params=True, **kwargs): @@ -168,32 +166,21 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): params_gv["sel_crit"] = sel_crit # select the AR Order - nr_scens = len(gv.keys()) - AR_order_scens_tmp = np.zeros(nr_scens) + AR_order_scen = list() + for scen in gv.keys(): - # np.percentile renamed the keyword in numpy v1.22 - if Version(np.__version__) >= Version("1.22.0"): - pct_kwargs = {"q": 50, "method": "nearest"} - else: - pct_kwargs = {"q": 50, "interpolation": "nearest"} + # create temporary DataArray + data = xr.DataArray(gv[scen], dims=["run", "time"]) - for scen_idx, scen in enumerate(gv.keys()): - nr_runs = gv[scen].shape[0] - AR_order_runs_tmp = np.zeros(nr_runs) - - for run in np.arange(nr_runs): - run_ar_lags = ar_select_order( - gv[scen][run], maxlag=max_lag, ic=sel_crit, old_names=False - ).ar_lags - # if order > 0 is selected,add selected order to vector - if len(run_ar_lags) > 0: - AR_order_runs_tmp[run] = run_ar_lags[-1] - - # interpolation is not a good way to go here because it could lead to an AR - # order that wasn't chosen by run -> avoid it by just taking nearest - AR_order_scens_tmp[scen_idx] = np.percentile(AR_order_runs_tmp, **pct_kwargs) - - AR_order_sel = int(np.percentile(AR_order_scens_tmp, **pct_kwargs)) + AR_order = _select_ar_order_xr(data, "time", maxlag=max_lag, criterion=sel_crit) + + # median over all ensemble members ("nearest" ensures an 'existing' lag is selected) + AR_order = AR_order.quantile(q=0.5, method="nearest") + AR_order_scen.append(AR_order) + + # median over all scenarios + AR_order_scen = xr.concat(AR_order_scen, dim="scen") + AR_order_sel = int(AR_order.quantile(q=0.5, method="nearest").item()) # determine the AR params for the selected AR order params_scen = list() diff --git a/mesmer/core/auto_regression.py b/mesmer/core/auto_regression.py index 222e9043..c661d696 100644 --- a/mesmer/core/auto_regression.py +++ b/mesmer/core/auto_regression.py @@ -2,6 +2,76 @@ import xarray as xr +def _select_ar_order_xr(data, dim, maxlag, ic="bic"): + """Select the order of an autoregressive AR-X(p) process - xarray wrapper + + Parameters + ---------- + data : array_like + A 1-d endogenous response variable. The independent variable. + dim : str + Dimension along which to determine the order. + maxlag : int + The maximum lag to consider. + ic : {'aic', 'hqic', 'bic'}, default 'bic' + The information criterion to use in the selection. + + Returns + ------- + selected_order : DataArray + Array indicating the selected order with the same size as the input but ``dim`` + removed. + + Notes + ----- + Only full models can be selected. + """ + + selected_order = xr.apply_ufunc( + _select_order_np, + data, + input_core_dims=[[dim]], + output_core_dims=((),), + vectorize=True, + output_dtypes=[int], + kwargs={"maxlag": maxlag, "ic": ic}, + ) + + # remove zeros + selected_order[selected_order == 0] = np.NaN + + return selected_order + + +def _select_order_np(data, maxlag, ic="bic"): + """Select the order of an autoregressive AR-X(p) process - numpy wrapper + + Parameters + ---------- + data : array_like + A 1-d endogenous response variable. The independent variable. + maxlag : int + The maximum lag to consider. + ic : {'aic', 'hqic', 'bic'}, default 'bic' + The information criterion to use in the selection. + + Returns + ------- + selected_order : int + The selected order. + + Notes + ----- + Only full models can be selected. + """ + + from statsmodels.tsa.ar_model import ar_select_order + + ar_lags = ar_select_order(data, maxlag=maxlag, ic=ic, old_names=False).ar_lags + + return ar_lags[-1] + + def _draw_auto_regression_correlated_np( *, intercept, coefs, covariance, n_samples, n_ts, seed, buffer ): From dfcf7894a8e13c3ac82d6f2a9f1dcff53edca6d6 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Fri, 1 Jul 2022 13:36:14 +0200 Subject: [PATCH 2/9] typo --- mesmer/calibrate_mesmer/train_gv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mesmer/calibrate_mesmer/train_gv.py b/mesmer/calibrate_mesmer/train_gv.py index 4d182403..55cc08ad 100644 --- a/mesmer/calibrate_mesmer/train_gv.py +++ b/mesmer/calibrate_mesmer/train_gv.py @@ -172,7 +172,7 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): # create temporary DataArray data = xr.DataArray(gv[scen], dims=["run", "time"]) - AR_order = _select_ar_order_xr(data, "time", maxlag=max_lag, criterion=sel_crit) + AR_order = _select_ar_order_xr(data, "time", maxlag=max_lag, ic=sel_crit) # median over all ensemble members ("nearest" ensures an 'existing' lag is selected) AR_order = AR_order.quantile(q=0.5, method="nearest") From 29ad0fd99607f4754c2a25124afd4bfa5c999c9c Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Fri, 1 Jul 2022 14:02:56 +0200 Subject: [PATCH 3/9] fix backcompat --- mesmer/calibrate_mesmer/train_gv.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_gv.py b/mesmer/calibrate_mesmer/train_gv.py index 55cc08ad..a158c571 100644 --- a/mesmer/calibrate_mesmer/train_gv.py +++ b/mesmer/calibrate_mesmer/train_gv.py @@ -14,6 +14,8 @@ import statsmodels.api as sm import xarray as xr +from packaging.version import Version + from mesmer.core.auto_regression import _fit_auto_regression_xr, _select_ar_order_xr @@ -165,6 +167,11 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): params_gv["max_lag"] = max_lag params_gv["sel_crit"] = sel_crit + if Version(xr.__version__) >= Version("2022.03.0"): + method = "method" + else: + method = "interpolation" + # select the AR Order AR_order_scen = list() for scen in gv.keys(): @@ -175,12 +182,12 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): AR_order = _select_ar_order_xr(data, "time", maxlag=max_lag, ic=sel_crit) # median over all ensemble members ("nearest" ensures an 'existing' lag is selected) - AR_order = AR_order.quantile(q=0.5, method="nearest") + AR_order = AR_order.quantile(q=0.5, **{method: "nearest"}) AR_order_scen.append(AR_order) # median over all scenarios AR_order_scen = xr.concat(AR_order_scen, dim="scen") - AR_order_sel = int(AR_order.quantile(q=0.5, method="nearest").item()) + AR_order_sel = int(AR_order.quantile(q=0.5, **{method: "nearest"}).item()) # determine the AR params for the selected AR order params_scen = list() From d293487c154e57c7a126dd3a6980e9b6f50d4b15 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Fri, 1 Jul 2022 14:26:55 +0200 Subject: [PATCH 4/9] linting --- mesmer/calibrate_mesmer/train_gv.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_gv.py b/mesmer/calibrate_mesmer/train_gv.py index a158c571..5f016723 100644 --- a/mesmer/calibrate_mesmer/train_gv.py +++ b/mesmer/calibrate_mesmer/train_gv.py @@ -13,7 +13,6 @@ import numpy as np import statsmodels.api as sm import xarray as xr - from packaging.version import Version from mesmer.core.auto_regression import _fit_auto_regression_xr, _select_ar_order_xr @@ -168,10 +167,10 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): params_gv["sel_crit"] = sel_crit if Version(xr.__version__) >= Version("2022.03.0"): - method = "method" - else: - method = "interpolation" - + method = "method" + else: + method = "interpolation" + # select the AR Order AR_order_scen = list() for scen in gv.keys(): From 3961c0d263274f0b87f6c1767898228348b92198 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 11 Jul 2022 15:28:17 +0200 Subject: [PATCH 5/9] update docstring --- mesmer/core/auto_regression.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mesmer/core/auto_regression.py b/mesmer/core/auto_regression.py index c661d696..d830b000 100644 --- a/mesmer/core/auto_regression.py +++ b/mesmer/core/auto_regression.py @@ -7,8 +7,8 @@ def _select_ar_order_xr(data, dim, maxlag, ic="bic"): Parameters ---------- - data : array_like - A 1-d endogenous response variable. The independent variable. + data : DataArray + A ``xr.DataArray`` to estimate the auto regression order. dim : str Dimension along which to determine the order. maxlag : int @@ -24,7 +24,7 @@ def _select_ar_order_xr(data, dim, maxlag, ic="bic"): Notes ----- - Only full models can be selected. + Only full models can be selected along one dimension. """ selected_order = xr.apply_ufunc( @@ -49,7 +49,7 @@ def _select_order_np(data, maxlag, ic="bic"): Parameters ---------- data : array_like - A 1-d endogenous response variable. The independent variable. + A numpy array to estimate the auto regression order. Must be 1D. maxlag : int The maximum lag to consider. ic : {'aic', 'hqic', 'bic'}, default 'bic' @@ -62,7 +62,8 @@ def _select_order_np(data, maxlag, ic="bic"): Notes ----- - Only full models can be selected. + Thin wrapper around ``statsmodels.tsa.ar_model.ar_select_order``. Only full models + can be selected. """ from statsmodels.tsa.ar_model import ar_select_order From ae96eecbabcc29425faaf379fee361e8659928b6 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 11 Jul 2022 21:02:34 +0200 Subject: [PATCH 6/9] add tests --- tests/integration/test_auto_regression.py | 66 ++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_auto_regression.py b/tests/integration/test_auto_regression.py index 4869f3b3..d1d17e24 100644 --- a/tests/integration/test_auto_regression.py +++ b/tests/integration/test_auto_regression.py @@ -7,7 +7,71 @@ import mesmer.core.auto_regression from mesmer.core.utils import _check_dataarray_form, _check_dataset_form -from .utils import trend_data_1D, trend_data_2D +from .utils import trend_data_1D, trend_data_2D, trend_data_3D + + +def test_select_ar_order_xr_1d(): + + data = trend_data_1D() + + result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 4) + + _check_dataarray_form(result, "selected_ar_order", ndim=0, shape=()) + + +@pytest.mark.parametrize("n_lon", [4, 5]) +@pytest.mark.parametrize("n_lat", [4, 5]) +def test_select_ar_order_xr_3d(n_lon, n_lat): + + data = trend_data_3D(n_lat=n_lat, n_lon=n_lon) + + result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 1) + + _check_dataarray_form( + result, + "selected_ar_order", + ndim=2, + required_dims={"lon", "lat"}, + shape=(n_lat, n_lon), + ) + + +def test_select_ar_order_xr_dim(): + + data = trend_data_3D(n_timesteps=4, n_lon=5) + result = mesmer.core.auto_regression._select_ar_order_xr(data, "lon", 1) + + _check_dataarray_form( + result, "selected_ar_order", ndim=2, required_dims={"time", "lat"}, shape=(4, 3) + ) + + +def test_select_ar_order_xr(): + + data = trend_data_2D() + + result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 4) + + coords = data.drop_vars("time").coords + + expected = xr.DataArray([4.0, 2.0, 2.0, 3.0, 4.0, 2.0], dims="cells", coords=coords) + + xr.testing.assert_equal(result, expected) + + +def test_select_ar_order_np(): + + rng = np.random.default_rng(seed=0) + data = rng.normal(size=100) + + result = mesmer.core.auto_regression._select_ar_order_np(data, 2) + assert np.isnan(result) + + result = mesmer.core.auto_regression._select_ar_order_np(data[:10], 2) + assert result == 2 + + with pytest.raises(ValueError): + mesmer.core.auto_regression._select_ar_order_np(data[:6], 5) @pytest.mark.parametrize("ar_order", [1, 8]) From a1de22d0fbd00e5924494d44788ccb9aa373f693 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 11 Jul 2022 21:02:44 +0200 Subject: [PATCH 7/9] update code --- mesmer/core/__init__.py | 2 +- mesmer/core/auto_regression.py | 23 ++++++++++++++--------- tests/integration/utils.py | 15 +++++++++++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/mesmer/core/__init__.py b/mesmer/core/__init__.py index 73a473fa..8bb45099 100644 --- a/mesmer/core/__init__.py +++ b/mesmer/core/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from . import linear_regression +from . import auto_regression, linear_regression diff --git a/mesmer/core/auto_regression.py b/mesmer/core/auto_regression.py index d830b000..12c50414 100644 --- a/mesmer/core/auto_regression.py +++ b/mesmer/core/auto_regression.py @@ -18,7 +18,7 @@ def _select_ar_order_xr(data, dim, maxlag, ic="bic"): Returns ------- - selected_order : DataArray + selected_ar_order : DataArray Array indicating the selected order with the same size as the input but ``dim`` removed. @@ -27,23 +27,25 @@ def _select_ar_order_xr(data, dim, maxlag, ic="bic"): Only full models can be selected along one dimension. """ - selected_order = xr.apply_ufunc( - _select_order_np, + selected_ar_order = xr.apply_ufunc( + _select_ar_order_np, data, input_core_dims=[[dim]], output_core_dims=((),), vectorize=True, - output_dtypes=[int], + output_dtypes=[float], kwargs={"maxlag": maxlag, "ic": ic}, ) # remove zeros - selected_order[selected_order == 0] = np.NaN + selected_ar_order.data[selected_ar_order.data == 0] = np.NaN - return selected_order + selected_ar_order.name = "selected_ar_order" + return selected_ar_order -def _select_order_np(data, maxlag, ic="bic"): + +def _select_ar_order_np(data, maxlag, ic="bic"): """Select the order of an autoregressive AR-X(p) process - numpy wrapper Parameters @@ -57,7 +59,7 @@ def _select_order_np(data, maxlag, ic="bic"): Returns ------- - selected_order : int + selected_ar_order : int The selected order. Notes @@ -70,7 +72,10 @@ def _select_order_np(data, maxlag, ic="bic"): ar_lags = ar_select_order(data, maxlag=maxlag, ic=ic, old_names=False).ar_lags - return ar_lags[-1] + # None is returned if no lag is selected + selected_ar_order = np.NaN if ar_lags is None else ar_lags[-1] + + return selected_ar_order def _draw_auto_regression_correlated_np( diff --git a/tests/integration/utils.py b/tests/integration/utils.py index d7d46806..963207b9 100644 --- a/tests/integration/utils.py +++ b/tests/integration/utils.py @@ -59,3 +59,18 @@ def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale= } return xr.DataArray(data, dims=("cells", "time"), coords=coords) + + +def trend_data_3D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=1): + + data = trend_data_2D( + n_timesteps=n_timesteps, + n_lat=n_lat, + n_lon=n_lon, + intercept=intercept, + slope=slope, + scale=scale, + ) + + # reshape to 3D (time x lat x lon) + return data.set_index(cells=("lat", "lon")).unstack("cells") From 217e0feca617c8617fe4bb9fedd31bc3ea50c143 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 11 Jul 2022 21:06:37 +0200 Subject: [PATCH 8/9] update docs --- CHANGELOG.rst | 3 +++ docs/source/api.rst | 1 + 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 84f1eb59..04596755 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -23,6 +23,9 @@ New Features - Add ``mesmer.core.auto_regression._draw_auto_regression_correlated_np``: to draw samples of an auto regression model (`#161 `_). By `Mathias Hauser `_. +- Extract function to select the order of the auto regressive model: ``mesmer.core.auto_regression._select_ar_order_xr`` + (`#176 `_). + By `Mathias Hauser `_. Breaking changes ^^^^^^^^^^^^^^^^ diff --git a/docs/source/api.rst b/docs/source/api.rst index 009dca95..9dc83d48 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -22,6 +22,7 @@ Statistical core functions ~core.linear_regression.LinearRegression.residuals ~core.linear_regression.LinearRegression.to_netcdf ~core.linear_regression.LinearRegression.from_netcdf + ~core.auto_regression._select_ar_order_xr ~core.auto_regression._fit_auto_regression_xr ~core.auto_regression._draw_auto_regression_correlated_np From 6501d99417c8291f8055768cf1ebf995af2cbd8b Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 11 Jul 2022 21:14:32 +0200 Subject: [PATCH 9/9] Apply suggestions from code review --- mesmer/calibrate_mesmer/train_gv.py | 2 +- mesmer/core/auto_regression.py | 4 ++-- tests/integration/test_auto_regression.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_gv.py b/mesmer/calibrate_mesmer/train_gv.py index 5f016723..f9d8d3d5 100644 --- a/mesmer/calibrate_mesmer/train_gv.py +++ b/mesmer/calibrate_mesmer/train_gv.py @@ -178,7 +178,7 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): # create temporary DataArray data = xr.DataArray(gv[scen], dims=["run", "time"]) - AR_order = _select_ar_order_xr(data, "time", maxlag=max_lag, ic=sel_crit) + AR_order = _select_ar_order_xr(data, dim="time", maxlag=max_lag, ic=sel_crit) # median over all ensemble members ("nearest" ensures an 'existing' lag is selected) AR_order = AR_order.quantile(q=0.5, **{method: "nearest"}) diff --git a/mesmer/core/auto_regression.py b/mesmer/core/auto_regression.py index 12c50414..05c7f5d2 100644 --- a/mesmer/core/auto_regression.py +++ b/mesmer/core/auto_regression.py @@ -3,7 +3,7 @@ def _select_ar_order_xr(data, dim, maxlag, ic="bic"): - """Select the order of an autoregressive AR-X(p) process - xarray wrapper + """Select the order of an autoregressive process - xarray wrapper Parameters ---------- @@ -46,7 +46,7 @@ def _select_ar_order_xr(data, dim, maxlag, ic="bic"): def _select_ar_order_np(data, maxlag, ic="bic"): - """Select the order of an autoregressive AR-X(p) process - numpy wrapper + """Select the order of an autoregressive AR(p) process - numpy wrapper Parameters ---------- diff --git a/tests/integration/test_auto_regression.py b/tests/integration/test_auto_regression.py index d1d17e24..444cf9d4 100644 --- a/tests/integration/test_auto_regression.py +++ b/tests/integration/test_auto_regression.py @@ -19,8 +19,8 @@ def test_select_ar_order_xr_1d(): _check_dataarray_form(result, "selected_ar_order", ndim=0, shape=()) -@pytest.mark.parametrize("n_lon", [4, 5]) -@pytest.mark.parametrize("n_lat", [4, 5]) +@pytest.mark.parametrize("n_lon", [1, 2]) +@pytest.mark.parametrize("n_lat", [3, 4]) def test_select_ar_order_xr_3d(n_lon, n_lat): data = trend_data_3D(n_lat=n_lat, n_lon=n_lon)