diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 84f1eb59..04596755 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -23,6 +23,9 @@ New Features - Add ``mesmer.core.auto_regression._draw_auto_regression_correlated_np``: to draw samples of an auto regression model (`#161 `_). By `Mathias Hauser `_. +- Extract function to select the order of the auto regressive model: ``mesmer.core.auto_regression._select_ar_order_xr`` + (`#176 `_). + By `Mathias Hauser `_. Breaking changes ^^^^^^^^^^^^^^^^ diff --git a/docs/source/api.rst b/docs/source/api.rst index 009dca95..9dc83d48 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -22,6 +22,7 @@ Statistical core functions ~core.linear_regression.LinearRegression.residuals ~core.linear_regression.LinearRegression.to_netcdf ~core.linear_regression.LinearRegression.from_netcdf + ~core.auto_regression._select_ar_order_xr ~core.auto_regression._fit_auto_regression_xr ~core.auto_regression._draw_auto_regression_correlated_np diff --git a/mesmer/calibrate_mesmer/train_gv.py b/mesmer/calibrate_mesmer/train_gv.py index 65235349..f9d8d3d5 100644 --- a/mesmer/calibrate_mesmer/train_gv.py +++ b/mesmer/calibrate_mesmer/train_gv.py @@ -14,9 +14,8 @@ import statsmodels.api as sm import xarray as xr from packaging.version import Version -from statsmodels.tsa.ar_model import ar_select_order -from mesmer.core.auto_regression import _fit_auto_regression_xr +from mesmer.core.auto_regression import _fit_auto_regression_xr, _select_ar_order_xr def train_gv(gv, targ, esm, cfg, save_params=True, **kwargs): @@ -167,33 +166,27 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): params_gv["max_lag"] = max_lag params_gv["sel_crit"] = sel_crit + if Version(xr.__version__) >= Version("2022.03.0"): + method = "method" + else: + method = "interpolation" + # select the AR Order - nr_scens = len(gv.keys()) - AR_order_scens_tmp = np.zeros(nr_scens) + AR_order_scen = list() + for scen in gv.keys(): - # np.percentile renamed the keyword in numpy v1.22 - if Version(np.__version__) >= Version("1.22.0"): - pct_kwargs = {"q": 50, "method": "nearest"} - else: - pct_kwargs = {"q": 50, "interpolation": "nearest"} + # create temporary DataArray + data = xr.DataArray(gv[scen], dims=["run", "time"]) - for scen_idx, scen in enumerate(gv.keys()): - nr_runs = gv[scen].shape[0] - AR_order_runs_tmp = np.zeros(nr_runs) - - for run in np.arange(nr_runs): - run_ar_lags = ar_select_order( - gv[scen][run], maxlag=max_lag, ic=sel_crit, old_names=False - ).ar_lags - # if order > 0 is selected,add selected order to vector - if len(run_ar_lags) > 0: - AR_order_runs_tmp[run] = run_ar_lags[-1] - - # interpolation is not a good way to go here because it could lead to an AR - # order that wasn't chosen by run -> avoid it by just taking nearest - AR_order_scens_tmp[scen_idx] = np.percentile(AR_order_runs_tmp, **pct_kwargs) - - AR_order_sel = int(np.percentile(AR_order_scens_tmp, **pct_kwargs)) + AR_order = _select_ar_order_xr(data, dim="time", maxlag=max_lag, ic=sel_crit) + + # median over all ensemble members ("nearest" ensures an 'existing' lag is selected) + AR_order = AR_order.quantile(q=0.5, **{method: "nearest"}) + AR_order_scen.append(AR_order) + + # median over all scenarios + AR_order_scen = xr.concat(AR_order_scen, dim="scen") + AR_order_sel = int(AR_order.quantile(q=0.5, **{method: "nearest"}).item()) # determine the AR params for the selected AR order params_scen = list() diff --git a/mesmer/core/__init__.py b/mesmer/core/__init__.py index 73a473fa..8bb45099 100644 --- a/mesmer/core/__init__.py +++ b/mesmer/core/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from . import linear_regression +from . import auto_regression, linear_regression diff --git a/mesmer/core/auto_regression.py b/mesmer/core/auto_regression.py index 222e9043..05c7f5d2 100644 --- a/mesmer/core/auto_regression.py +++ b/mesmer/core/auto_regression.py @@ -2,6 +2,82 @@ import xarray as xr +def _select_ar_order_xr(data, dim, maxlag, ic="bic"): + """Select the order of an autoregressive process - xarray wrapper + + Parameters + ---------- + data : DataArray + A ``xr.DataArray`` to estimate the auto regression order. + dim : str + Dimension along which to determine the order. + maxlag : int + The maximum lag to consider. + ic : {'aic', 'hqic', 'bic'}, default 'bic' + The information criterion to use in the selection. + + Returns + ------- + selected_ar_order : DataArray + Array indicating the selected order with the same size as the input but ``dim`` + removed. + + Notes + ----- + Only full models can be selected along one dimension. + """ + + selected_ar_order = xr.apply_ufunc( + _select_ar_order_np, + data, + input_core_dims=[[dim]], + output_core_dims=((),), + vectorize=True, + output_dtypes=[float], + kwargs={"maxlag": maxlag, "ic": ic}, + ) + + # remove zeros + selected_ar_order.data[selected_ar_order.data == 0] = np.NaN + + selected_ar_order.name = "selected_ar_order" + + return selected_ar_order + + +def _select_ar_order_np(data, maxlag, ic="bic"): + """Select the order of an autoregressive AR(p) process - numpy wrapper + + Parameters + ---------- + data : array_like + A numpy array to estimate the auto regression order. Must be 1D. + maxlag : int + The maximum lag to consider. + ic : {'aic', 'hqic', 'bic'}, default 'bic' + The information criterion to use in the selection. + + Returns + ------- + selected_ar_order : int + The selected order. + + Notes + ----- + Thin wrapper around ``statsmodels.tsa.ar_model.ar_select_order``. Only full models + can be selected. + """ + + from statsmodels.tsa.ar_model import ar_select_order + + ar_lags = ar_select_order(data, maxlag=maxlag, ic=ic, old_names=False).ar_lags + + # None is returned if no lag is selected + selected_ar_order = np.NaN if ar_lags is None else ar_lags[-1] + + return selected_ar_order + + def _draw_auto_regression_correlated_np( *, intercept, coefs, covariance, n_samples, n_ts, seed, buffer ): diff --git a/tests/integration/test_auto_regression.py b/tests/integration/test_auto_regression.py index 4869f3b3..444cf9d4 100644 --- a/tests/integration/test_auto_regression.py +++ b/tests/integration/test_auto_regression.py @@ -7,7 +7,71 @@ import mesmer.core.auto_regression from mesmer.core.utils import _check_dataarray_form, _check_dataset_form -from .utils import trend_data_1D, trend_data_2D +from .utils import trend_data_1D, trend_data_2D, trend_data_3D + + +def test_select_ar_order_xr_1d(): + + data = trend_data_1D() + + result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 4) + + _check_dataarray_form(result, "selected_ar_order", ndim=0, shape=()) + + +@pytest.mark.parametrize("n_lon", [1, 2]) +@pytest.mark.parametrize("n_lat", [3, 4]) +def test_select_ar_order_xr_3d(n_lon, n_lat): + + data = trend_data_3D(n_lat=n_lat, n_lon=n_lon) + + result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 1) + + _check_dataarray_form( + result, + "selected_ar_order", + ndim=2, + required_dims={"lon", "lat"}, + shape=(n_lat, n_lon), + ) + + +def test_select_ar_order_xr_dim(): + + data = trend_data_3D(n_timesteps=4, n_lon=5) + result = mesmer.core.auto_regression._select_ar_order_xr(data, "lon", 1) + + _check_dataarray_form( + result, "selected_ar_order", ndim=2, required_dims={"time", "lat"}, shape=(4, 3) + ) + + +def test_select_ar_order_xr(): + + data = trend_data_2D() + + result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 4) + + coords = data.drop_vars("time").coords + + expected = xr.DataArray([4.0, 2.0, 2.0, 3.0, 4.0, 2.0], dims="cells", coords=coords) + + xr.testing.assert_equal(result, expected) + + +def test_select_ar_order_np(): + + rng = np.random.default_rng(seed=0) + data = rng.normal(size=100) + + result = mesmer.core.auto_regression._select_ar_order_np(data, 2) + assert np.isnan(result) + + result = mesmer.core.auto_regression._select_ar_order_np(data[:10], 2) + assert result == 2 + + with pytest.raises(ValueError): + mesmer.core.auto_regression._select_ar_order_np(data[:6], 5) @pytest.mark.parametrize("ar_order", [1, 8]) diff --git a/tests/integration/utils.py b/tests/integration/utils.py index d7d46806..963207b9 100644 --- a/tests/integration/utils.py +++ b/tests/integration/utils.py @@ -59,3 +59,18 @@ def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale= } return xr.DataArray(data, dims=("cells", "time"), coords=coords) + + +def trend_data_3D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=1): + + data = trend_data_2D( + n_timesteps=n_timesteps, + n_lat=n_lat, + n_lon=n_lon, + intercept=intercept, + slope=slope, + scale=scale, + ) + + # reshape to 3D (time x lat x lon) + return data.set_index(cells=("lat", "lon")).unstack("cells")