Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tag all_years: True #1122

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions doc/recipe/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,27 @@ data specifications:
``RCP8.5``)
- mip (for CMIP data, key ``mip``, value e.g. ``Amon``, ``Omon``, ``LImon``)
- ensemble member (key ``ensemble``, value e.g. ``r1i1p1``, ``r1i1p1f1``)
- sub-experiment id (key ``sub_experiment``, value e.g. ``s2000``,
``s(2000:2002)``, for DCPP data only)
- time range (e.g. key-value ``start_year: 1982``, ``end_year: 1990``. Please
note that `yaml`_ interprets numbers with a leading ``0`` as octal numbers,
so we recommend avoiding them. For example, use ``128`` to specify the year
128 instead of ``0128``.)
- model grid (native grid ``grid: gn`` or regridded grid ``grid: gr``, for
CMIP6 data only).
- load all years (key-value ``all_years: True`` to load all the years available in
a particular dataset; ``start_year`` and ``end_year`` can then be omitted)

For example, a datasets section could be:

.. code-block:: yaml

datasets:
- {dataset: CanESM2, project: CMIP5, exp: historical, ensemble: r1i1p1, start_year: 2001, end_year: 2004}
- {dataset: ACCESS1-0, project: CMIP5, exp: historical, ensemble: r1i1p1, all_years: True}
- {dataset: UKESM1-0-LL, project: CMIP6, exp: historical, ensemble: r1i1p1f2, start_year: 2001, end_year: 2004, grid: gn}
- {dataset: EC-EARTH3, alias: custom_alias, project: CMIP6, exp: historical, ensemble: r1i1p1f1, start_year: 2001, end_year: 2004, grid: gn}
- {dataset: HadGEM3-GC31-MM, alias: custom_alias, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: s2000, grid: gn, start_year: 2000, end_year: 2002}

It is possible to define the experiment as a list to concatenate two experiments.
Here is an example concatenating the `historical` experiment with `rcp85`
Expand All @@ -114,9 +120,9 @@ In this case, the specified datasets are concatenated into a single cube:
datasets:
- {dataset: CanESM2, project: CMIP5, exp: [historical, rcp85], ensemble: [r1i1p1, r1i2p1], start_year: 2001, end_year: 2004}

ESMValTool also supports a simplified syntax to add multiple ensemble members from the same dataset.
ESMValTool also supports a simplified syntax to add multiple ensemble members.
In the ensemble key, any element in the form `(x:y)` will be replaced with all numbers from x to y (both inclusive),
adding a dataset entry for each replacement. For example, to add ensemble members r1i1p1 to r10i1p1
adding a dataset entry for each replacement. For example, to add ensemble members r1i1p1 to r10i1p1
you can use the following abbreviated syntax:

.. code-block:: yaml
Expand All @@ -136,6 +142,15 @@ Please, bear in mind that this syntax can only be used in the ensemble tag.
Also, note that the combination of multiple experiments and ensembles, like
exp: [historical, rcp85], ensemble: [r1i1p1, "r(2:3)i1p1"] is not supported and will raise an error.

The same simplified syntax can be used to add multiple sub-experiment ids, and it can be combined with the ``all_years: True`` tag:

.. code-block:: yaml

datasets:
- {dataset: MIROC6, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: "s(2000:2002)", grid: gn, start_year: 2003, end_year: 2004}
- {dataset: MIROC6, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: "s(1980:1990)", grid: gn, all_years: True}


Note that this section is not required, as datasets can also be provided in the
Diagnostics_ section.

Expand Down
32 changes: 26 additions & 6 deletions esmvalcore/_data_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,21 @@ def select_files(filenames, start_year, end_year):
def _replace_tags(paths, variable):
"""Replace tags in the config-developer's file with actual values."""
if isinstance(paths, str):
paths = (paths.strip('/'), )
paths = set((paths.strip('/'),))
else:
paths = [path.strip('/') for path in paths]
paths = set(path.strip('/') for path in paths)
tlist = set()

for path in paths:
tlist = tlist.union(re.findall(r'{([^}]*)}', path))
if 'sub_experiment' in variable:
new_paths = []
for path in paths:
new_paths.extend((
re.sub(r'(\b{ensemble}\b)', r'{sub_experiment}-\1', path),
re.sub(r'({ensemble})', r'{sub_experiment}-\1', path)
))
tlist.add('sub_experiment')
paths = new_paths
logger.debug(tlist)

for tag in tlist:
Expand All @@ -112,7 +120,6 @@ def _replace_tags(paths, variable):
else:
raise KeyError("Dataset key {} must be specified for {}, check "
"your recipe entry".format(tag, variable))

paths = _replace_tag(paths, original_tag, replacewith)
return paths

Expand All @@ -127,7 +134,7 @@ def _replace_tag(paths, tag, replacewith):
else:
text = _apply_caps(str(replacewith), lower, upper)
result.extend(p.replace('{' + tag + '}', text) for p in paths)
return result
return list(set(result))


def _get_caps_options(tag):
Expand Down Expand Up @@ -229,6 +236,17 @@ def _get_filenames_glob(variable, drs):
return filenames_glob


def _update_output_file(variable, files):
    """Fill in the time range of ``variable`` from the files that were found.

    The earliest start year and the latest end year covered by ``files`` are
    stored in ``variable`` under ``start_year`` and ``end_year``, and the
    resulting range is inserted into ``variable['filename']`` in front of
    ``.nc``.

    Parameters
    ----------
    variable : dict
        Variable definition; must contain the key ``filename``. It is
        modified in place.
    files : list
        Input file names; each one is passed to ``get_start_end_year``.

    Returns
    -------
    dict
        The same ``variable`` object, updated.

    Raises
    ------
    ValueError
        If ``files`` is empty, because no time range can be derived.
    """
    intervals = [get_start_end_year(name) for name in files]
    # Look up the two extremes independently. Tuples compare by their first
    # item, so ``max(intervals)[1]`` would give the end year of the file that
    # starts last, which is not necessarily the latest end year (e.g. for
    # overlapping files covering 1990-2019 and 2000-2005).
    variable['start_year'] = min(interval[0] for interval in intervals)
    variable['end_year'] = max(interval[1] for interval in intervals)
    variable['filename'] = variable['filename'].replace(
        '.nc', '_{start_year}-{end_year}.nc'.format(**variable))
    return variable


def _find_input_files(variable, rootpath, drs):
short_name = variable['short_name']
variable['short_name'] = variable['original_short_name']
Expand All @@ -248,6 +266,8 @@ def get_input_filelist(variable, rootpath, drs):
(files, dirnames, filenames) = _find_input_files(variable, rootpath, drs)
# do time gating only for non-fx variables
if variable['frequency'] != 'fx':
if 'all_years' in variable:
variable = _update_output_file(variable, files)
files = select_files(files, variable['start_year'],
variable['end_year'])
return (files, dirnames, filenames)
Expand All @@ -268,7 +288,7 @@ def get_output_file(variable, preproc_dir):
variable['variable_group'],
_replace_tags(cfg['output_file'], variable)[0],
)
if variable['frequency'] != 'fx':
if variable['frequency'] != 'fx' and 'all_years' not in variable:
outfile += '_{start_year}-{end_year}'.format(**variable)
outfile += '.nc'
return outfile
Expand Down
41 changes: 26 additions & 15 deletions esmvalcore/_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,37 +1027,38 @@ def _initialize_datasets(raw_datasets):
return datasets

@staticmethod
def _expand_ensemble(variables):
"""Expand ensemble members to multiple datasets.
def _expand_tag(variables, input_tag):
"""
Expand tags such as ensemble members or stardates to multiple datasets.

Expansion only supports ensembles defined as strings, not lists.
"""
expanded = []
regex = re.compile(r'\(\d+:\d+\)')

def expand_ensemble(variable):
ens = variable.get('ensemble', "")
match = regex.search(ens)
def expand_tag(variable, input_tag):
tag = variable.get(input_tag, "")
match = regex.search(tag)
if match:
start, end = match.group(0)[1:-1].split(':')
for i in range(int(start), int(end) + 1):
expand = deepcopy(variable)
expand['ensemble'] = regex.sub(str(i), ens, 1)
expand_ensemble(expand)
expand[input_tag] = regex.sub(str(i), tag, 1)
expand_tag(expand, input_tag)
else:
expanded.append(variable)

for variable in variables:
ensemble = variable.get('ensemble', "")
if isinstance(ensemble, (list, tuple)):
for elem in ensemble:
tag = variable.get(input_tag, "")
if isinstance(tag, (list, tuple)):
for elem in tag:
if regex.search(elem):
raise RecipeError(
f"In variable {variable}: ensemble expansion "
"cannot be combined with ensemble lists")
f"In variable {variable}: {input_tag} expansion "
f"cannot be combined with {input_tag} lists")
expanded.append(variable)
else:
expand_ensemble(variable)
expand_tag(variable, input_tag)

return expanded

Expand Down Expand Up @@ -1104,8 +1105,18 @@ def _initialize_variables(self, raw_variable, raw_datasets):
activity = get_activity(variable)
if activity:
variable['activity'] = activity
check.variable(variable, required_keys)
variables = self._expand_ensemble(variables)
if 'all_years' in variable:
if variable['all_years']:
required_keys.discard('start_year')
required_keys.discard('end_year')
if 'sub_experiment' in variable:
subexperiment_keys = deepcopy(required_keys)
subexperiment_keys.update({'sub_experiment'})
check.variable(variable, subexperiment_keys)
else:
check.variable(variable, required_keys)
variables = self._expand_tag(variables, 'ensemble')
variables = self._expand_tag(variables, 'sub_experiment')
return variables

def _initialize_preprocessor_output(self, diagnostic_name, raw_variables,
Expand Down
2 changes: 1 addition & 1 deletion esmvalcore/_recipe_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def data_availability(input_files, var, dirnames, filenames):
f"Missing data for {var['alias']}: {var['short_name']}")

# check time avail only for non-fx variables
if var['frequency'] == 'fx':
if var['frequency'] == 'fx' or 'all_years' in var:
return

required_years = set(range(var['start_year'], var['end_year'] + 1))
Expand Down
2 changes: 1 addition & 1 deletion esmvalcore/recipe_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ variable:
alternative_dataset: str(required=False)
fx_files: list(required=False)
additional_datasets: list(include('dataset'), required=False)

all_years: bool(required=False)
# TODO: add preprocessor item

diagnostic:
Expand Down
54 changes: 54 additions & 0 deletions tests/integration/test_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,60 @@ def test_empty_variable(tmp_path, patched_datafinder, config_user):
assert product.attributes['dataset'] == 'CanESM2'


def test_all_years_tag(tmp_path, patched_datafinder, config_user):
"""Check that ``all_years: True`` works for a time-dependent variable.

The recipe defines ``pr`` (mip ``Amon``) without ``start_year`` or
``end_year``; the single resulting product is expected to carry a year
range in its file name.
"""
content = dedent("""
diagnostics:
diagnostic_name:
additional_datasets:
- dataset: CanESM2
project: CMIP5
mip: Amon
exp: historical
all_years: True
ensemble: r1i1p1
variables:
pr:
scripts: null
""")

recipe = get_recipe(tmp_path, content, config_user)
# Exactly one task with exactly one product is expected for this recipe.
assert len(recipe.tasks) == 1
task = recipe.tasks.pop()
assert len(task.products) == 1
product = task.products.pop()
assert product.attributes['short_name'] == 'pr'
assert product.attributes['dataset'] == 'CanESM2'
# NOTE(review): 1990-2019 is presumably the span of the files served by the
# patched_datafinder fixture -- confirm against the fixture.
assert '1990-2019' in product.filename


def test_fx_all_years_tag(tmp_path, patched_datafinder, config_user):
"""Check that ``all_years: True`` does not break a time-independent variable.

The recipe requests ``areacella`` with mip ``fx``; the single resulting
product must not get a year range in its file name.
"""
content = dedent("""
diagnostics:
diagnostic_name:
additional_datasets:
- dataset: CanESM2
project: CMIP5
mip: fx
exp: historical
all_years: True
ensemble: r1i1p1
variables:
areacella:
scripts: null
""")

recipe = get_recipe(tmp_path, content, config_user)
# Exactly one task with exactly one product is expected for this recipe.
assert len(recipe.tasks) == 1
task = recipe.tasks.pop()
assert len(task.products) == 1
product = task.products.pop()
assert product.attributes['short_name'] == 'areacella'
assert product.attributes['dataset'] == 'CanESM2'
# Same year range that test_all_years_tag expects to find; here it must be
# absent from the file name.
assert '1990-2019' not in product.filename


def test_cmip3_variable_autocomplete(tmp_path, patched_datafinder,
config_user):
"""Test that required information is automatically added for CMIP5."""
Expand Down
65 changes: 54 additions & 11 deletions tests/unit/data_finder/test_replace_tags.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,65 @@
"""Tests for _replace_tags in _data_finder.py."""

from esmvalcore._data_finder import _replace_tags

# Facet values shared by the tests below; they are substituted into the
# ``{tag}`` placeholders of the templates passed to ``_replace_tags``.
VARIABLE = {
'project': 'CMIP6',
'dataset': 'ACCURATE-MODEL',
'activity': 'act',
'exp': 'experiment',
'institute': 'HMA',
'ensemble': 'r1i1p1f1',
'mip': 'Amon',
'short_name': 'tas',
'grid': 'gr',
}


def test_replace_tags_str():
assert _replace_tags('folder/subfolder/{short_name}',
VARIABLE) == ['folder/subfolder/tas']
def test_replace_tags():
"""Test _replace_tags on a directory, an input file and an output file template.

Each tag that has a value in ``VARIABLE`` must be substituted;
``{latestversion}`` is expected to be left in place.
"""
path = _replace_tags(
'{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/'
'{grid}/{latestversion}', VARIABLE)
input_file = _replace_tags(
'{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', VARIABLE)
output_file = _replace_tags(
'{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', VARIABLE)
# A single template string is expected to yield a one-element list.
assert path == [
'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/'
'{latestversion}'
]
assert input_file == ['tas_Amon_ACCURATE-MODEL_experiment_r1i1p1f1_gr*.nc']
assert output_file == ['CMIP6_ACCURATE-MODEL_Amon_experiment_r1i1p1f1_tas']


def test_replace_tags_list_of_str():
assert _replace_tags(('folder/subfolder/{short_name}',
'folder2/{short_name}', 'subfolder/{short_name}'),
VARIABLE) == [
'folder/subfolder/tas',
'folder2/tas',
'subfolder/tas',
]
assert sorted(
_replace_tags(('folder/subfolder/{short_name}', 'folder2/{short_name}',
'subfolder/{short_name}'), VARIABLE)) == sorted([
'folder2/tas',
'folder/subfolder/tas',
'subfolder/tas',
])


def test_replace_tags_with_subexperiment():
"""Test _replace_tags when the variable defines a ``sub_experiment``.

The sub-experiment id is expected to be prefixed to the ensemble member
as ``<sub_experiment>-<ensemble>``.
"""
variable = {'sub_experiment': '199411', **VARIABLE}
path = _replace_tags(
'{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/'
'{grid}/{latestversion}', variable)
input_file = _replace_tags(
'{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', variable)
output_file = _replace_tags(
'{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', variable)
# For the directory template two candidates are expected, one without and
# one with the sub-experiment prefix; the order is not checked, hence sorted().
assert sorted(path) == sorted([
'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/'
'{latestversion}',
'act/HMA/ACCURATE-MODEL/experiment/199411-r1i1p1f1/Amon/tas/gr/'
'{latestversion}'
])
# For the file templates only the prefixed form is expected.
assert input_file == [
'tas_Amon_ACCURATE-MODEL_experiment_199411-r1i1p1f1_gr*.nc'
]
assert output_file == [
'CMIP6_ACCURATE-MODEL_Amon_experiment_199411-r1i1p1f1_tas'
]
Loading