Feature/issue-142: duplicated dimension error with TEMPO ozone profile (#141)

* add test for ozone profile proxy data

* rework duplicate dimension removal to work with TEMPO ozone profile data

* pylint update

* simplify return of `remove_duplicate_dims()` and `open_as_nc_dataset()`

* remove unused import per pylint

* add test data files for TEMPO NO2 and O3PROF (contains duplicate dimension)

* clean up comments

* Revert "simplify return of `remove_duplicate_dims() and `open_as_nc_dataset()`"

This reverts commit e7b7096.

* include Tuple import

* update CHANGELOG.md

* Revert "Revert "simplify return of `remove_duplicate_dims() and `open_as_nc_dataset()`""

This reverts commit 3fe9c2a.
danielfromearth authored Feb 17, 2023
1 parent 7cd60e3 commit 2bf1a2d
Showing 6 changed files with 81 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ### Added
 ### Changed
+- [issue/142](https://github.com/podaac/l2ss-py/issues/142): Changed handling of duplicate dimensions as part of integration with new TEMPO ozone profile data.
 ### Deprecated
 ### Removed
 ### Fixed
65 changes: 44 additions & 21 deletions podaac/subsetter/dimension_cleanup.py
@@ -18,7 +18,7 @@
 import xarray as xr
 
 
-def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]]:
+def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
     """
     xarray cannot read netCDF4 datasets with duplicate dimensions.
     Function goes through a dataset to catch any variables with duplicate dimensions.
@@ -28,46 +28,69 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]
"""
     dup_vars = {}
     dup_new_varnames = []
 
     for var_name, var in nc_dataset.variables.items():
         dim_list = list(var.dimensions)
         if len(set(dim_list)) != len(dim_list):  # get true if var.dimensions has a duplicate
             dup_vars[var_name] = var  # populate dictionary with variables with vars with dup dims
 
     for dup_var_name, dup_var in dup_vars.items():
-        dim_list = list(dup_var.dimensions)  # list of original dimensions of variable with dup dims
-        # get the dimensions that are duplicated
+        dim_list = list(dup_var.dimensions)  # original dimensions of the variable with duplicated dims
+
+        # Dimension(s) that are duplicated are retrieved.
+        # Note: this is not yet tested for more than one duplicated dimension.
         dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
+        dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)]  # length of the duplicated dimension
 
+        # New dimension and variable names are created.
         dim_dup_new = dim_dup+'_1'
         var_name_new = dup_var_name+'_1'
         dup_new_varnames.append(var_name_new)
 
-        # create new dimension by copying from the duplicated dimension
-        data = {}
+        # The last dimension for the variable is replaced with the new name in a temporary list.
+        new_dim_list = dim_list[:-1]
+        new_dim_list.extend([dim_dup_new])
+
+        new_dup_var = {}
+        attrs_contents = {}
+
+        # Attributes for the original variable are retrieved.
+        for attrname in dup_var.ncattrs():
+            if attrname != '_FillValue':
+                attrs_contents[attrname] = nc_dataset.variables[dup_var_name].getncattr(attrname)
+
         fill_value = dup_var._FillValue  # pylint: disable=W0212
-        nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
-        data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
-                                                      (dim_dup_new,), fill_value=fill_value)
-
-        for ncattr in nc_dataset.variables[dim_dup].ncattrs():
-            if ncattr != '_FillValue':
-                data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
-        data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
-
-        new_dim_list = dim_list[:-1]
-        new_dim_list.extend([dim_dup_new])
-
-        # createVariable with new dimensions
-        data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
-
-        for attrname in dup_var.ncattrs():
-            if attrname != '_FillValue':
-                data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
-        data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
+        # Only create a new *Dimension* if it doesn't already exist.
+        if dim_dup_new not in nc_dataset.dimensions.keys():
+
+            # New dimension is created by copying from the duplicated dimension.
+            nc_dataset.createDimension(dim_dup_new, dim_dup_length)
+
+            # Only create a new dimension *Variable* if it existed originally in the NetCDF structure.
+            if dim_dup in nc_dataset.variables.keys():
+
+                # New variable object is created for the renamed, previously duplicated dimension.
+                new_dup_var[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
+                                                                     (dim_dup_new,), fill_value=fill_value)
+                # New variable's attributes are set to the original ones.
+                for ncattr in nc_dataset.variables[dim_dup].ncattrs():
+                    if ncattr != '_FillValue':
+                        new_dup_var[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
+                new_dup_var[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
+
+        # Delete existing Variable
+        del nc_dataset.variables[dup_var_name]
+
+        # Replace original *Variable* with new variable with no duplicated dimensions.
+        new_dup_var[dup_var_name] = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
+                                                              tuple(new_dim_list), fill_value=fill_value)
+        for attr_name, contents in attrs_contents.items():
+            new_dup_var[dup_var_name].setncattr(attr_name, contents)
+        new_dup_var[dup_var_name][:] = dup_var[:]
 
-    # return the variables that will need to be renamed: Rename method is still an issue per https://github.com/Unidata/netcdf-c/issues/1672
-    return nc_dataset, dup_new_varnames
+    return nc_dataset


def rename_dup_vars(dataset: xr.Dataset, rename_vars: List[str]) -> xr.Dataset:
Expand Down
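To make the change concrete, here is a minimal, illustrative sketch of the situation this function handles; it is not part of the commit. The file name, `layer` dimension, and `kernel` variable are invented, loosely echoing the averaging-kernel-style layout the commit message attributes to TEMPO ozone profile data, and it mirrors the spirit of the new test_duplicate_dims_tempo_ozone test added below.

import collections

import netCDF4 as nc
import numpy as np
import xarray as xr

from podaac.subsetter import dimension_cleanup as dc

# An in-memory file with a variable that uses the same dimension twice.
ds = nc.Dataset('dup_dim_demo.nc', mode='w', diskless=True)
ds.createDimension('layer', 3)
kernel = ds.createVariable('kernel', 'f4', ('layer', 'layer'), fill_value=-9999.0)
kernel[:] = np.eye(3, dtype='f4')

# Duplicate detection, done the same way as in remove_duplicate_dims().
dims = list(ds.variables['kernel'].dimensions)
assert [d for d, n in collections.Counter(dims).items() if n > 1] == ['layer']

# Clone the duplicated dimension ('layer' -> 'layer_1') and rebuild the variable.
# Note there is no 'layer' coordinate variable here, which is the situation the
# new "if dim_dup in nc_dataset.variables" guard accounts for.
ds = dc.remove_duplicate_dims(ds)
print(ds.variables['kernel'].dimensions)  # expected: ('layer', 'layer_1')

# xarray can now wrap the dataset without a duplicate-dimension error.
xr_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), decode_times=False)
print(xr_ds['kernel'].dims)

The guard appears to be the key fix: the duplicated dimension in the new data has no matching coordinate variable, which the previous implementation implicitly assumed was always present.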
9 changes: 4 additions & 5 deletions podaac/subsetter/subset.py
@@ -970,7 +970,7 @@ def convert_to_datetime(dataset: xr.Dataset, time_vars: list) -> Tuple[xr.Datase
     return dataset, start_date
 
 
-def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
+def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, bool]:
     """Open netcdf file, and flatten groups if they exist."""
     file_extension = filepath.split('.')[-1]

@@ -985,9 +985,9 @@ def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
     if has_groups:
         nc_dataset = transform_grouped_dataset(nc_dataset, filepath)
 
-    nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset)
+    nc_dataset = dc.remove_duplicate_dims(nc_dataset)
 
-    return nc_dataset, rename_vars, has_groups
+    return nc_dataset, has_groups


 def override_decode_cf_datetime() -> None:

@@ -1071,7 +1071,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     than one value in the case where there are multiple groups and
     different coordinate variables for each group.
     """
-    nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset)
+    nc_dataset, has_groups = open_as_nc_dataset(file_to_subset)

     override_decode_cf_datetime()

@@ -1099,7 +1099,6 @@
             xr.backends.NetCDF4DataStore(nc_dataset),
             **args
     ) as dataset:
-        dataset = dc.rename_dup_vars(dataset, rename_vars)
         lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
             dataset=dataset,
             lat_var_names=lat_var_names,
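With the simplified signatures, callers unpack two values and hand the dataset straight to xarray; the rename pass is gone because remove_duplicate_dims() now rebuilds each variable under its original name. A sketch of the updated call pattern follows; the granule path is a placeholder and the decode arguments are illustrative, mirroring the tests in this commit.

import xarray as xr

from podaac.subsetter import subset

# 'granule.nc' is a placeholder path for illustration.
nc_dataset, has_groups = subset.open_as_nc_dataset('granule.nc')

# Duplicate dimensions were already cleaned up inside open_as_nc_dataset(),
# so no rename_dup_vars() step is needed before wrapping in xarray.
with xr.open_dataset(xr.backends.NetCDF4DataStore(nc_dataset),
                     decode_times=False,
                     decode_coords=False) as dataset:
    print(dataset.dims)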
Binary file not shown.
Binary file not shown.
36 changes: 32 additions & 4 deletions tests/test_subset.py
@@ -183,7 +183,7 @@ def test_subset_bbox(test_file, data_dir, subset_output_dir, request):
         output_file=subset_output_file
     )
 
-    out_ds, rename_vars, _ = subset.open_as_nc_dataset(subset_output_file)
+    out_ds, _ = subset.open_as_nc_dataset(subset_output_file)
     out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
                              decode_times=False,
                              decode_coords=False,
@@ -549,7 +549,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
     bbox = np.array(((-180, 180), (-90, 90)))
     output_file = "{}_{}".format(request.node.name, test_file)
 
-    in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
+    in_ds, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
     in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds),
                             decode_times=False,
                             decode_coords=False)
@@ -575,7 +575,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
         variables=[var.replace(GROUP_DELIM, '/') for var in included_variables]
     )
 
-    out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
+    out_ds, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
     out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
                              decode_times=False,
                              decode_coords=False)
@@ -1226,7 +1226,7 @@ def test_get_time_variable_name(test_file, data_dir, subset_output_dir):
         'mask_and_scale': False,
         'decode_times': True
     }
-    ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
+    ds, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
     ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args)
 
     lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0]
@@ -1330,6 +1330,34 @@ def test_duplicate_dims_tropomi(data_dir, subset_output_dir, request):
         assert variable.shape == \
             out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape
 
+
+def test_duplicate_dims_tempo_ozone(data_dir, subset_output_dir, request):
+    """
+    Check if TEMPO ozone files run successfully even though
+    these files have variables with duplicate dimensions
+    """
+    TEMPO_dir = join(data_dir, 'TEMPO')
+    tempo_ozone_file = 'TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc'
+
+    bbox = np.array(((-180, 180), (-90, 90)))
+    output_file = "{}_{}".format(request.node.name, tempo_ozone_file)
+    shutil.copyfile(
+        os.path.join(TEMPO_dir, tempo_ozone_file),
+        os.path.join(subset_output_dir, tempo_ozone_file)
+    )
+    box_test = subset.subset(
+        file_to_subset=join(subset_output_dir, tempo_ozone_file),
+        bbox=bbox,
+        output_file=join(subset_output_dir, output_file)
+    )
+    # Compare variable shapes between input and subset output to confirm the
+    # duplicated dimension was handled without altering the data layout.
+    in_nc = nc.Dataset(join(TEMPO_dir, tempo_ozone_file))
+    out_nc = nc.Dataset(join(subset_output_dir, output_file))
+
+    for var_name, variable in in_nc.groups['support_data'].variables.items():
+        assert variable.shape == \
+            out_nc.groups['support_data'].variables[var_name].shape
 
 
 def test_omi_novars_subset(data_dir, subset_output_dir, request):
     """
