Feature/issue-142: duplicated dimension error with TEMPO ozone profile (#141)

* add test for ozone profile proxy data

* rework duplicate dimension removal to work with TEMPO ozone profile data

* pylint update

* simplify return of `remove_duplicate_dims()` and `open_as_nc_dataset()`

* remove unused import per pylint

* add test data files for TEMPO NO2 and O3PROF (contains duplicate dimension)

* clean up comments

* Revert "simplify return of `remove_duplicate_dims() and `open_as_nc_dataset()`"

This reverts commit e7b7096.

* include Tuple import

* update CHANGELOG.md

* Revert "Revert "simplify return of `remove_duplicate_dims() and `open_as_nc_dataset()`""

This reverts commit 3fe9c2a.
danielfromearth authored Feb 17, 2023
1 parent 7cd60e3 commit 2bf1a2d
Showing 6 changed files with 81 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ### Added
 ### Changed
+- [issue/142](https://github.com/podaac/l2ss-py/issues/142): Changed handling of duplicate dimensions as part of integration with new TEMPO ozone profile data.
 ### Deprecated
 ### Removed
 ### Fixed
65 changes: 44 additions & 21 deletions podaac/subsetter/dimension_cleanup.py
@@ -18,7 +18,7 @@
 import xarray as xr
 
 
-def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]]:
+def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
     """
     xarray cannot read netCDF4 datasets with duplicate dimensions.
     Function goes through a dataset to catch any variables with duplicate dimensions.
@@ -28,46 +28,69 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]
"""
     dup_vars = {}
     dup_new_varnames = []
 
     for var_name, var in nc_dataset.variables.items():
         dim_list = list(var.dimensions)
         if len(set(dim_list)) != len(dim_list):  # get true if var.dimensions has a duplicate
             dup_vars[var_name] = var  # populate dictionary with variables with vars with dup dims
 
     for dup_var_name, dup_var in dup_vars.items():
-        dim_list = list(dup_var.dimensions)  # list of original dimensions of variable with dup dims
-        # get the dimensions that are duplicated
+        dim_list = list(dup_var.dimensions)  # original dimensions of the variable with duplicated dims
+
+        # Dimension(s) that are duplicated are retrieved.
+        # Note: this is not yet tested for more than one duplicated dimension.
         dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
+        dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)]  # length of the duplicated dimension
 
+        # New dimension and variable names are created.
         dim_dup_new = dim_dup+'_1'
         var_name_new = dup_var_name+'_1'
         dup_new_varnames.append(var_name_new)
 
-        # create new dimension by copying from the duplicated dimension
-        data = {}
+        # The last dimension for the variable is replaced with the new name in a temporary list.
+        new_dim_list = dim_list[:-1]
+        new_dim_list.extend([dim_dup_new])
+
+        new_dup_var = {}
+        attrs_contents = {}
+
+        # Attributes for the original variable are retrieved.
+        for attrname in dup_var.ncattrs():
+            if attrname != '_FillValue':
+                attrs_contents[attrname] = nc_dataset.variables[dup_var_name].getncattr(attrname)
+
         fill_value = dup_var._FillValue  # pylint: disable=W0212
-        nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
-        data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
-                                                      (dim_dup_new,), fill_value=fill_value)
-
-        for ncattr in nc_dataset.variables[dim_dup].ncattrs():
-            if ncattr != '_FillValue':
-                data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
-        data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
-
-        new_dim_list = dim_list[:-1]
-        new_dim_list.extend([dim_dup_new])
-
-        # createVariable with new dimensions
-        data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
-
-        for attrname in dup_var.ncattrs():
-            if attrname != '_FillValue':
-                data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
-        data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
+        # Only create a new *Dimension* if it doesn't already exist.
+        if dim_dup_new not in nc_dataset.dimensions.keys():
+
+            # New dimension is created by copying from the duplicated dimension.
+            nc_dataset.createDimension(dim_dup_new, dim_dup_length)
+
+            # Only create a new dimension *Variable* if it existed originally in the NetCDF structure.
+            if dim_dup in nc_dataset.variables.keys():
+
+                # New variable object is created for the renamed, previously duplicated dimension.
+                new_dup_var[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
+                                                                     (dim_dup_new,), fill_value=fill_value)
+                # New variable's attributes are set to the original ones.
+                for ncattr in nc_dataset.variables[dim_dup].ncattrs():
+                    if ncattr != '_FillValue':
+                        new_dup_var[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
+                new_dup_var[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
+
+        # Delete existing Variable
+        del nc_dataset.variables[dup_var_name]
+
+        # Replace original *Variable* with new variable with no duplicated dimensions.
+        new_dup_var[dup_var_name] = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
+                                                              tuple(new_dim_list), fill_value=fill_value)
+        for attr_name, contents in attrs_contents.items():
+            new_dup_var[dup_var_name].setncattr(attr_name, contents)
+        new_dup_var[dup_var_name][:] = dup_var[:]
 
-    # return the variables that will need to be renamed: Rename method is still an issue per https://github.com/Unidata/netcdf-c/issues/1672
-    return nc_dataset, dup_new_varnames
+    return nc_dataset


def rename_dup_vars(dataset: xr.Dataset, rename_vars: List[str]) -> xr.Dataset:
Expand Down
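To make the change concrete, here is a minimal, illustrative sketch of the situation this function handles; it is not part of the commit. The file name, `layer` dimension, and `kernel` variable are invented, loosely echoing the averaging-kernel-style layout the commit message attributes to TEMPO ozone profile data, and it mirrors the spirit of the new test_duplicate_dims_tempo_ozone test added below.

import collections

import netCDF4 as nc
import numpy as np
import xarray as xr

from podaac.subsetter import dimension_cleanup as dc

# An in-memory file with a variable that uses the same dimension twice.
ds = nc.Dataset('dup_dim_demo.nc', mode='w', diskless=True)
ds.createDimension('layer', 3)
kernel = ds.createVariable('kernel', 'f4', ('layer', 'layer'), fill_value=-9999.0)
kernel[:] = np.eye(3, dtype='f4')

# Duplicate detection, done the same way as in remove_duplicate_dims().
dims = list(ds.variables['kernel'].dimensions)
assert [d for d, n in collections.Counter(dims).items() if n > 1] == ['layer']

# Clone the duplicated dimension ('layer' -> 'layer_1') and rebuild the variable.
# Note there is no 'layer' coordinate variable here, which is the situation the
# new "if dim_dup in nc_dataset.variables" guard accounts for.
ds = dc.remove_duplicate_dims(ds)
print(ds.variables['kernel'].dimensions)  # expected: ('layer', 'layer_1')

# xarray can now wrap the dataset without a duplicate-dimension error.
xr_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), decode_times=False)
print(xr_ds['kernel'].dims)

The guard appears to be the key fix: the duplicated dimension in the new data has no matching coordinate variable, which the previous implementation implicitly assumed was always present.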
9 changes: 4 additions & 5 deletions podaac/subsetter/subset.py
@@ -970,7 +970,7 @@ def convert_to_datetime(dataset: xr.Dataset, time_vars: list) -> Tuple[xr.Datase
     return dataset, start_date
 
 
-def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
+def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, bool]:
     """Open netcdf file, and flatten groups if they exist."""
     file_extension = filepath.split('.')[-1]

@@ -985,9 +985,9 @@ def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
     if has_groups:
         nc_dataset = transform_grouped_dataset(nc_dataset, filepath)
 
-    nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset)
+    nc_dataset = dc.remove_duplicate_dims(nc_dataset)
 
-    return nc_dataset, rename_vars, has_groups
+    return nc_dataset, has_groups


 def override_decode_cf_datetime() -> None:

@@ -1071,7 +1071,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     than one value in the case where there are multiple groups and
     different coordinate variables for each group.
     """
-    nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset)
+    nc_dataset, has_groups = open_as_nc_dataset(file_to_subset)

     override_decode_cf_datetime()

@@ -1099,7 +1099,6 @@
             xr.backends.NetCDF4DataStore(nc_dataset),
             **args
     ) as dataset:
-        dataset = dc.rename_dup_vars(dataset, rename_vars)
         lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
             dataset=dataset,
             lat_var_names=lat_var_names,
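With the simplified signatures, callers unpack two values and hand the dataset straight to xarray; the rename pass is gone because remove_duplicate_dims() now rebuilds each variable under its original name. A sketch of the updated call pattern follows; the granule path is a placeholder and the decode arguments are illustrative, mirroring the tests in this commit.

import xarray as xr

from podaac.subsetter import subset

# 'granule.nc' is a placeholder path for illustration.
nc_dataset, has_groups = subset.open_as_nc_dataset('granule.nc')

# Duplicate dimensions were already cleaned up inside open_as_nc_dataset(),
# so no rename_dup_vars() step is needed before wrapping in xarray.
with xr.open_dataset(xr.backends.NetCDF4DataStore(nc_dataset),
                     decode_times=False,
                     decode_coords=False) as dataset:
    print(dataset.dims)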
Binary file not shown.
Binary file not shown.
36 changes: 32 additions & 4 deletions tests/test_subset.py
@@ -183,7 +183,7 @@ def test_subset_bbox(test_file, data_dir, subset_output_dir, request):
         output_file=subset_output_file
     )
 
-    out_ds, rename_vars, _ = subset.open_as_nc_dataset(subset_output_file)
+    out_ds, _ = subset.open_as_nc_dataset(subset_output_file)
     out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
                              decode_times=False,
                              decode_coords=False,
@@ -549,7 +549,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
     bbox = np.array(((-180, 180), (-90, 90)))
     output_file = "{}_{}".format(request.node.name, test_file)
 
-    in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
+    in_ds, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
     in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds),
                             decode_times=False,
                             decode_coords=False)
@@ -575,7 +575,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
         variables=[var.replace(GROUP_DELIM, '/') for var in included_variables]
     )
 
-    out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
+    out_ds, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
     out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
                              decode_times=False,
                              decode_coords=False)
@@ -1226,7 +1226,7 @@ def test_get_time_variable_name(test_file, data_dir, subset_output_dir):
         'mask_and_scale': False,
         'decode_times': True
     }
-    ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
+    ds, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
     ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args)
 
     lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0]
@@ -1330,6 +1330,34 @@ def test_duplicate_dims_tropomi(data_dir, subset_output_dir, request):
         assert variable.shape == \
             out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape
 
+
+def test_duplicate_dims_tempo_ozone(data_dir, subset_output_dir, request):
+    """
+    Check if TEMPO ozone files run successfully even though
+    these files have variables with duplicate dimensions
+    """
+    TEMPO_dir = join(data_dir, 'TEMPO')
+    tempo_ozone_file = 'TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc'
+
+    bbox = np.array(((-180, 180), (-90, 90)))
+    output_file = "{}_{}".format(request.node.name, tempo_ozone_file)
+    shutil.copyfile(
+        os.path.join(TEMPO_dir, tempo_ozone_file),
+        os.path.join(subset_output_dir, tempo_ozone_file)
+    )
+    box_test = subset.subset(
+        file_to_subset=join(subset_output_dir, tempo_ozone_file),
+        bbox=bbox,
+        output_file=join(subset_output_dir, output_file)
+    )
+    # Compare variable shapes between input and subset output to confirm the
+    # duplicated dimension was handled without altering the data layout.
+    in_nc = nc.Dataset(join(TEMPO_dir, tempo_ozone_file))
+    out_nc = nc.Dataset(join(subset_output_dir, output_file))
+
+    for var_name, variable in in_nc.groups['support_data'].variables.items():
+        assert variable.shape == \
+            out_nc.groups['support_data'].variables[var_name].shape
 
 
 def test_omi_novars_subset(data_dir, subset_output_dir, request):
     """
