Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/issue-142: duplicated dimension error with TEMPO ozone profile #141

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
### Changed
- [issue/142](https://github.com/podaac/l2ss-py/issues/142): Changed handling of duplicate dimensions as part of integration with new TEMPO ozone profile data.
### Deprecated
### Removed
### Fixed
Expand Down
65 changes: 44 additions & 21 deletions podaac/subsetter/dimension_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import xarray as xr


def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]]:
def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
"""
xarray cannot read netCDF4 datasets with duplicate dimensions.
Function goes through a dataset to catch any variables with duplicate dimensions.
Expand All @@ -28,46 +28,69 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]
"""
dup_vars = {}
dup_new_varnames = []

for var_name, var in nc_dataset.variables.items():
dim_list = list(var.dimensions)
if len(set(dim_list)) != len(dim_list): # get true if var.dimensions has a duplicate
dup_vars[var_name] = var # populate dictionary with variables with vars with dup dims

for dup_var_name, dup_var in dup_vars.items():
dim_list = list(dup_var.dimensions) # list of original dimensions of variable with dup dims
# get the dimensions that are duplicated
dim_list = list(dup_var.dimensions) # original dimensions of the variable with duplicated dims

# Dimension(s) that are duplicated are retrieved.
# Note: this is not yet tested for more than one duplicated dimension.
dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)] # length of the duplicated dimension

# New dimension and variable names are created.
dim_dup_new = dim_dup+'_1'
var_name_new = dup_var_name+'_1'
dup_new_varnames.append(var_name_new)

# create new dimension by copying from the duplicated dimension
# The last dimension for the variable is replaced with the new name in a temporary list.
new_dim_list = dim_list[:-1]
new_dim_list.extend([dim_dup_new])

new_dup_var = {}
attrs_contents = {}

# Attributes for the original variable are retrieved.
for attrname in dup_var.ncattrs():
if attrname != '_FillValue':
attrs_contents[attrname] = nc_dataset.variables[dup_var_name].getncattr(attrname)

data = {}
fill_value = dup_var._FillValue # pylint: disable=W0212
nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,), fill_value=fill_value)

for ncattr in nc_dataset.variables[dim_dup].ncattrs():
if ncattr != '_FillValue':
data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
# Only create a new *Dimension* if it doesn't already exist.
if dim_dup_new not in nc_dataset.dimensions.keys():

new_dim_list = dim_list[:-1]
new_dim_list.extend([dim_dup_new])
# New dimension is created by copying from the duplicated dimension.
nc_dataset.createDimension(dim_dup_new, dim_dup_length)

# createVariable with new dimensions
# Only create a new dimension *Variable* if it existed originally in the NetCDF structure.
if dim_dup in nc_dataset.variables.keys():

data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
# New variable object is created for the renamed, previously duplicated dimension.
new_dup_var[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,), fill_value=fill_value)
# New variable's attributes are set to the original ones.
for ncattr in nc_dataset.variables[dim_dup].ncattrs():
if ncattr != '_FillValue':
new_dup_var[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
new_dup_var[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]

for attrname in dup_var.ncattrs():
if attrname != '_FillValue':
data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
# Delete existing Variable
del nc_dataset.variables[dup_var_name]

# Replace original *Variable* with new variable with no duplicated dimensions.
new_dup_var[dup_var_name] = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
tuple(new_dim_list), fill_value=fill_value)
for attr_name, contents in attrs_contents.items():
new_dup_var[dup_var_name].setncattr(attr_name, contents)
new_dup_var[dup_var_name][:] = dup_var[:]

# return the variables that will need to be renamed: Rename method is still an issue per https://github.com/Unidata/netcdf-c/issues/1672
return nc_dataset, dup_new_varnames
return nc_dataset


def rename_dup_vars(dataset: xr.Dataset, rename_vars: List[str]) -> xr.Dataset:
Expand Down
9 changes: 4 additions & 5 deletions podaac/subsetter/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ def convert_to_datetime(dataset: xr.Dataset, time_vars: list) -> Tuple[xr.Datase
return dataset, start_date


def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, bool]:
"""Open netcdf file, and flatten groups if they exist."""
file_extension = filepath.split('.')[-1]

Expand All @@ -985,9 +985,9 @@ def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
if has_groups:
nc_dataset = transform_grouped_dataset(nc_dataset, filepath)

nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset)
nc_dataset = dc.remove_duplicate_dims(nc_dataset)

return nc_dataset, rename_vars, has_groups
return nc_dataset, has_groups


def override_decode_cf_datetime() -> None:
Expand Down Expand Up @@ -1071,7 +1071,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
than one value in the case where there are multiple groups and
different coordinate variables for each group.
"""
nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset)
nc_dataset, has_groups = open_as_nc_dataset(file_to_subset)

override_decode_cf_datetime()

Expand Down Expand Up @@ -1099,7 +1099,6 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
xr.backends.NetCDF4DataStore(nc_dataset),
**args
) as dataset:
dataset = dc.rename_dup_vars(dataset, rename_vars)
lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
dataset=dataset,
lat_var_names=lat_var_names,
Expand Down
Binary file not shown.
Binary file not shown.
36 changes: 32 additions & 4 deletions tests/test_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def test_subset_bbox(test_file, data_dir, subset_output_dir, request):
output_file=subset_output_file
)

out_ds, rename_vars, _ = subset.open_as_nc_dataset(subset_output_file)
out_ds, _ = subset.open_as_nc_dataset(subset_output_file)
out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
decode_times=False,
decode_coords=False,
Expand Down Expand Up @@ -549,7 +549,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, test_file)

in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
in_ds, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds),
decode_times=False,
decode_coords=False)
Expand All @@ -575,7 +575,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
variables=[var.replace(GROUP_DELIM, '/') for var in included_variables]
)

out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
out_ds, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
decode_times=False,
decode_coords=False)
Expand Down Expand Up @@ -1226,7 +1226,7 @@ def test_get_time_variable_name(test_file, data_dir, subset_output_dir):
'mask_and_scale': False,
'decode_times': True
}
ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
ds, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args)

lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0]
Expand Down Expand Up @@ -1330,6 +1330,34 @@ def test_duplicate_dims_tropomi(data_dir, subset_output_dir, request):
assert variable.shape == \
out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape

def test_duplicate_dims_tempo_ozone(data_dir, subset_output_dir, request):
    """
    Verify that a TEMPO ozone profile file subsets successfully even
    though it contains variables with duplicated dimensions.

    The input granule is copied into the scratch directory, subset with a
    whole-globe bounding box, and every variable in the 'support_data'
    group of the output is checked to have the same shape as in the input.
    """
    TEMPO_dir = join(data_dir, 'TEMPO')
    tempo_ozone_file = 'TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc'

    # Whole-globe bounding box: nothing should be trimmed, so variable
    # shapes must be preserved through the duplicate-dimension handling.
    bbox = np.array(((-180, 180), (-90, 90)))
    output_file = "{}_{}".format(request.node.name, tempo_ozone_file)
    shutil.copyfile(
        os.path.join(TEMPO_dir, tempo_ozone_file),
        os.path.join(subset_output_dir, tempo_ozone_file)
    )
    # The subset call itself must not raise on duplicated dimensions; its
    # return value (the spatial bounds) is not needed for this check.
    subset.subset(
        file_to_subset=join(subset_output_dir, tempo_ozone_file),
        bbox=bbox,
        output_file=join(subset_output_dir, output_file)
    )

    in_nc = nc.Dataset(join(TEMPO_dir, tempo_ozone_file))
    out_nc = nc.Dataset(join(subset_output_dir, output_file))
    try:
        for var_name, variable in in_nc.groups['support_data'].variables.items():
            assert variable.shape == \
                out_nc.groups['support_data'].variables[var_name].shape
    finally:
        # Close both handles even if an assertion fails, so the scratch
        # files are not left open on Windows/NFS test runners.
        in_nc.close()
        out_nc.close()


def test_omi_novars_subset(data_dir, subset_output_dir, request):
"""
Expand Down