ESMValGroup · sloosvel · May 11, 2020 · May 12, 2020 · May 13, 2020 · May 13, 2020
diff --git a/doc/recipe/overview.rst b/doc/recipe/overview.rst
@@ -82,21 +82,27 @@ data specifications:
   ``RCP8.5``)
 - mip (for CMIP data, key ``mip``, value e.g. ``Amon``, ``Omon``, ``LImon``)
 - ensemble member (key ``ensemble``, value e.g. ``r1i1p1``, ``r1i1p1f1``)
+- sub-experiment id (key ``sub_experiment``, value e.g. ``s2000``, ``s(2000:2002)``,
+  for DCPP data only)
 - time range (e.g. key-value ``start_year: 1982``, ``end_year: 1990``. Please
   note that `yaml`_ interprets numbers with a leading ``0`` as octal numbers,
   so we recommend to avoid them. For example, use ``128`` to specify the year
   128 instead of ``0128``.)
 - model grid (native grid ``grid: gn`` or regridded grid ``grid: gr``, for
   CMIP6 data only).
+- load all years (key-value ``all_years: True`` to load all the years available in 
+  a particular dataset)
 
 For example, a datasets section could be:
 
 .. code-block:: yaml
 
     datasets:
       - {dataset: CanESM2, project: CMIP5, exp: historical, ensemble: r1i1p1, start_year: 2001, end_year: 2004}
+      - {dataset: ACCESS1-0, project: CMIP5, exp: historical, ensemble: r1i1p1, all_years: True}
       - {dataset: UKESM1-0-LL, project: CMIP6, exp: historical, ensemble: r1i1p1f2, start_year: 2001, end_year: 2004, grid: gn}
       - {dataset: EC-EARTH3, alias: custom_alias, project: CMIP6, exp: historical, ensemble: r1i1p1f1, start_year: 2001, end_year: 2004, grid: gn}
+      - {dataset: HadGEM3-GC31-MM, alias: custom_alias, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: s2000, grid: gn, start_year: 2000, end_year: 2002}
 
 It is possible to define the experiment as a list to concatenate two experiments.
 Here it is an example concatenating the `historical` experiment with `rcp85`
@@ -114,9 +120,9 @@ In this case, the specified datasets are concatenated into a single cube:
     datasets:
       - {dataset: CanESM2, project: CMIP5, exp: [historical, rcp85], ensemble: [r1i1p1, r1i2p1], start_year: 2001, end_year: 2004}
 
-ESMValTool also supports a simplified syntax to add multiple ensemble members from the same dataset.
+ESMValTool also supports a simplified syntax to add multiple ensemble members.
 In the ensemble key, any element in the form `(x:y)` will be replaced with all numbers from x to y (both inclusive),
-adding a dataset entry for each replacement. For example, to add ensemble members r1i1p1 to r10i1p1
+adding a dataset entry for each replacement. For example, to add ensemble members r1i1p1 to r10i1p1 
 you can use the following abbreviated syntax:
 
 .. code-block:: yaml
@@ -136,6 +142,15 @@ Please, bear in mind that this syntax can only be used in the ensemble tag.
 Also, note that the combination of multiple experiments and ensembles, like
 exp: [historical, rcp85], ensemble: [r1i1p1, "r(2:3)i1p1"] is not supported and will raise an error.
 
+The same simplified syntax can be used to add multiple sub-experiment ids, and it can be combined with the ``all_years: True`` tag:
+
+.. code-block:: yaml
+
+    datasets:
+      - {dataset: MIROC6, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: "s(2000:2002)", grid: gn, start_year: 2003, end_year: 2004}
+      - {dataset: MIROC6, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: "s(1980:1990)", grid: gn, all_years: True}
+
+
 Note that this section is not required, as datasets can also be provided in the
 Diagnostics_ section.
 

diff --git a/esmvalcore/_data_finder.py b/esmvalcore/_data_finder.py
@@ -92,13 +92,21 @@ def select_files(filenames, start_year, end_year):
 def _replace_tags(paths, variable):
     """Replace tags in the config-developer's file with actual values."""
     if isinstance(paths, str):
-        paths = (paths.strip('/'), )
+        paths = set((paths.strip('/'),))
     else:
-        paths = [path.strip('/') for path in paths]
+        paths = set(path.strip('/') for path in paths)
     tlist = set()
-
     for path in paths:
         tlist = tlist.union(re.findall(r'{([^}]*)}', path))
+    if 'sub_experiment' in variable:
+        new_paths = []
+        for path in paths:
+            new_paths.extend((
+                re.sub(r'(\b{ensemble}\b)', r'{sub_experiment}-\1', path),
+                re.sub(r'({ensemble})', r'{sub_experiment}-\1', path)
+            ))
+            tlist.add('sub_experiment')
+        paths = new_paths
     logger.debug(tlist)
 
     for tag in tlist:
@@ -112,7 +120,6 @@ def _replace_tags(paths, variable):
         else:
             raise KeyError("Dataset key {} must be specified for {}, check "
                            "your recipe entry".format(tag, variable))
-
         paths = _replace_tag(paths, original_tag, replacewith)
     return paths
 
@@ -127,7 +134,7 @@ def _replace_tag(paths, tag, replacewith):
     else:
         text = _apply_caps(str(replacewith), lower, upper)
         result.extend(p.replace('{' + tag + '}', text) for p in paths)
-    return result
+    return list(set(result))
 
 
 def _get_caps_options(tag):
@@ -229,6 +236,17 @@ def _get_filenames_glob(variable, drs):
     return filenames_glob
 
 
+def _update_output_file(variable, files):
+    intervals = [get_start_end_year(name) for name in files]
+    variable.update({'start_year': min(intervals)[0]})
+    variable.update({'end_year': max(intervals)[1]})
+    filename = variable['filename'].replace(
+        '.nc', '_{start_year}-{end_year}.nc'.format(**variable)
+    )
+    variable['filename'] = filename
+    return variable
+
+
 def _find_input_files(variable, rootpath, drs):
     short_name = variable['short_name']
     variable['short_name'] = variable['original_short_name']
@@ -248,6 +266,8 @@ def get_input_filelist(variable, rootpath, drs):
     (files, dirnames, filenames) = _find_input_files(variable, rootpath, drs)
     # do time gating only for non-fx variables
     if variable['frequency'] != 'fx':
+        if 'all_years' in variable:
+            variable = _update_output_file(variable, files)
         files = select_files(files, variable['start_year'],
                              variable['end_year'])
     return (files, dirnames, filenames)
@@ -268,7 +288,7 @@ def get_output_file(variable, preproc_dir):
         variable['variable_group'],
         _replace_tags(cfg['output_file'], variable)[0],
     )
-    if variable['frequency'] != 'fx':
+    if variable['frequency'] != 'fx' and 'all_years' not in variable:
         outfile += '_{start_year}-{end_year}'.format(**variable)
     outfile += '.nc'
     return outfile

diff --git a/esmvalcore/_recipe.py b/esmvalcore/_recipe.py
@@ -1027,37 +1027,38 @@ def _initialize_datasets(raw_datasets):
         return datasets
 
     @staticmethod
-    def _expand_ensemble(variables):
-        """Expand ensemble members to multiple datasets.
+    def _expand_tag(variables, input_tag):
+        """
+        Expand tags like ensemble members or start dates to multiple datasets.
 
         Expansion only supports ensembles defined as strings, not lists.
         """
         expanded = []
         regex = re.compile(r'\(\d+:\d+\)')
 
-        def expand_ensemble(variable):
-            ens = variable.get('ensemble', "")
-            match = regex.search(ens)
+        def expand_tag(variable, input_tag):
+            tag = variable.get(input_tag, "")
+            match = regex.search(tag)
             if match:
                 start, end = match.group(0)[1:-1].split(':')
                 for i in range(int(start), int(end) + 1):
                     expand = deepcopy(variable)
-                    expand['ensemble'] = regex.sub(str(i), ens, 1)
-                    expand_ensemble(expand)
+                    expand[input_tag] = regex.sub(str(i), tag, 1)
+                    expand_tag(expand, input_tag)
             else:
                 expanded.append(variable)
 
         for variable in variables:
-            ensemble = variable.get('ensemble', "")
-            if isinstance(ensemble, (list, tuple)):
-                for elem in ensemble:
+            tag = variable.get(input_tag, "")
+            if isinstance(tag, (list, tuple)):
+                for elem in tag:
                     if regex.search(elem):
                         raise RecipeError(
-                            f"In variable {variable}: ensemble expansion "
-                            "cannot be combined with ensemble lists")
+                            f"In variable {variable}: {input_tag} expansion "
+                            f"cannot be combined with {input_tag} lists")
                 expanded.append(variable)
             else:
-                expand_ensemble(variable)
+                expand_tag(variable, input_tag)
 
         return expanded
 
@@ -1104,8 +1105,18 @@ def _initialize_variables(self, raw_variable, raw_datasets):
                 activity = get_activity(variable)
                 if activity:
                     variable['activity'] = activity
-            check.variable(variable, required_keys)
-        variables = self._expand_ensemble(variables)
+            if 'all_years' in variable:
+                if variable['all_years']:
+                    required_keys.discard('start_year')
+                    required_keys.discard('end_year')
+            if 'sub_experiment' in variable:
+                subexperiment_keys = deepcopy(required_keys)
+                subexperiment_keys.update({'sub_experiment'})
+                check.variable(variable, subexperiment_keys)
+            else:
+                check.variable(variable, required_keys)
+        variables = self._expand_tag(variables, 'ensemble')
+        variables = self._expand_tag(variables, 'sub_experiment')
         return variables
 
     def _initialize_preprocessor_output(self, diagnostic_name, raw_variables,

diff --git a/esmvalcore/_recipe_checks.py b/esmvalcore/_recipe_checks.py
@@ -127,7 +127,7 @@ def data_availability(input_files, var, dirnames, filenames):
             f"Missing data for {var['alias']}: {var['short_name']}")
 
     # check time avail only for non-fx variables
-    if var['frequency'] == 'fx':
+    if var['frequency'] == 'fx' or 'all_years' in var:
         return
 
     required_years = set(range(var['start_year'], var['end_year'] + 1))

diff --git a/esmvalcore/recipe_schema.yml b/esmvalcore/recipe_schema.yml
@@ -41,7 +41,7 @@ variable:
   alternative_dataset: str(required=False)
   fx_files: list(required=False)
   additional_datasets: list(include('dataset'), required=False)
-
+  all_years: bool(required=False)
 # TODO: add preprocessor item
 
 diagnostic:

diff --git a/tests/integration/test_recipe.py b/tests/integration/test_recipe.py
@@ -632,6 +632,60 @@ def test_empty_variable(tmp_path, patched_datafinder, config_user):
     assert product.attributes['dataset'] == 'CanESM2'
 
 
+def test_all_years_tag(tmp_path, patched_datafinder, config_user):
+    """Test all_years tag for time-dependent variables."""
+    content = dedent("""
+        diagnostics:
+          diagnostic_name:
+            additional_datasets:
+              - dataset: CanESM2
+                project: CMIP5
+                mip: Amon
+                exp: historical
+                all_years: True
+                ensemble: r1i1p1
+            variables:
+              pr:
+            scripts: null
+        """)
+
+    recipe = get_recipe(tmp_path, content, config_user)
+    assert len(recipe.tasks) == 1
+    task = recipe.tasks.pop()
+    assert len(task.products) == 1
+    product = task.products.pop()
+    assert product.attributes['short_name'] == 'pr'
+    assert product.attributes['dataset'] == 'CanESM2'
+    assert '1990-2019' in product.filename
+
+
+def test_fx_all_years_tag(tmp_path, patched_datafinder, config_user):
+    """Test all_years tag does not break time-independent variables."""
+    content = dedent("""
+        diagnostics:
+          diagnostic_name:
+            additional_datasets:
+              - dataset: CanESM2
+                project: CMIP5
+                mip: fx
+                exp: historical
+                all_years: True
+                ensemble: r1i1p1
+            variables:
+              areacella:
+            scripts: null
+        """)
+
+    recipe = get_recipe(tmp_path, content, config_user)
+    assert len(recipe.tasks) == 1
+    task = recipe.tasks.pop()
+    assert len(task.products) == 1
+    product = task.products.pop()
+    assert product.attributes['short_name'] == 'areacella'
+    assert product.attributes['dataset'] == 'CanESM2'
+    assert '1990-2019' not in product.filename
+
+
 def test_cmip3_variable_autocomplete(tmp_path, patched_datafinder,
                                      config_user):
     """Test that required information is automatically added for CMIP5."""

diff --git a/tests/unit/data_finder/test_replace_tags.py b/tests/unit/data_finder/test_replace_tags.py
@@ -1,22 +1,65 @@
 """Tests for _replace_tags in _data_finder.py."""
-
 from esmvalcore._data_finder import _replace_tags
 
 VARIABLE = {
+    'project': 'CMIP6',
+    'dataset': 'ACCURATE-MODEL',
+    'activity': 'act',
+    'exp': 'experiment',
+    'institute': 'HMA',
+    'ensemble': 'r1i1p1f1',
+    'mip': 'Amon',
     'short_name': 'tas',
+    'grid': 'gr',
 }
 
 
-def test_replace_tags_str():
-    assert _replace_tags('folder/subfolder/{short_name}',
-                         VARIABLE) == ['folder/subfolder/tas']
+def test_replace_tags():
+    """Tests for _replace_tags function."""
+    path = _replace_tags(
+        '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/'
+        '{grid}/{latestversion}', VARIABLE)
+    input_file = _replace_tags(
+        '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', VARIABLE)
+    output_file = _replace_tags(
+        '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', VARIABLE)
+    assert path == [
+        'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/'
+        '{latestversion}'
+    ]
+    assert input_file == ['tas_Amon_ACCURATE-MODEL_experiment_r1i1p1f1_gr*.nc']
+    assert output_file == ['CMIP6_ACCURATE-MODEL_Amon_experiment_r1i1p1f1_tas']
 
 
 def test_replace_tags_list_of_str():
-    assert _replace_tags(('folder/subfolder/{short_name}',
-                          'folder2/{short_name}', 'subfolder/{short_name}'),
-                         VARIABLE) == [
-                             'folder/subfolder/tas',
-                             'folder2/tas',
-                             'subfolder/tas',
-                         ]
+    assert sorted(
+        _replace_tags(('folder/subfolder/{short_name}', 'folder2/{short_name}',
+                       'subfolder/{short_name}'), VARIABLE)) == sorted([
+                           'folder2/tas',
+                           'folder/subfolder/tas',
+                           'subfolder/tas',
+                       ])
+
+
+def test_replace_tags_with_subexperiment():
+    """Tests for _replace_tags function with a sub_experiment key."""
+    variable = {'sub_experiment': '199411', **VARIABLE}
+    path = _replace_tags(
+        '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/'
+        '{grid}/{latestversion}', variable)
+    input_file = _replace_tags(
+        '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', variable)
+    output_file = _replace_tags(
+        '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', variable)
+    assert sorted(path) == sorted([
+        'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/'
+        '{latestversion}',
+        'act/HMA/ACCURATE-MODEL/experiment/199411-r1i1p1f1/Amon/tas/gr/'
+        '{latestversion}'
+    ])
+    assert input_file == [
+        'tas_Amon_ACCURATE-MODEL_experiment_199411-r1i1p1f1_gr*.nc'
+    ]
+    assert output_file == [
+        'CMIP6_ACCURATE-MODEL_Amon_experiment_199411-r1i1p1f1_tas'
+    ]