From e7d5656e6da642958254eed6d466d2684ddb6e40 Mon Sep 17 00:00:00 2001 From: David Hoese Date: Thu, 22 Aug 2024 12:19:30 -0500 Subject: [PATCH 1/3] Update pyhdf-based arrs to be manually tokenized This avoids a bug in dask or cloudpickle that alters the state of the pyhdf SDS object in some way making it unusable. --- satpy/readers/hdf4_utils.py | 24 +++++++++++++++++++----- satpy/readers/hdfeos_base.py | 2 +- satpy/readers/modis_l1b.py | 4 ++-- satpy/readers/modis_l2.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/satpy/readers/hdf4_utils.py b/satpy/readers/hdf4_utils.py index d6258d9d62..2b836e9f26 100644 --- a/satpy/readers/hdf4_utils.py +++ b/satpy/readers/hdf4_utils.py @@ -18,10 +18,12 @@ """Helpers for reading hdf4-based files.""" import logging +import os import dask.array as da import numpy as np import xarray as xr +from dask.base import tokenize from pyhdf.SD import SD, SDC, SDS from satpy.readers.file_handlers import BaseFileHandler @@ -45,12 +47,24 @@ } -def from_sds(var, *args, **kwargs): +def from_sds(var, src_path, *args, **kwargs): """Create a dask array from a SD dataset.""" - var.__dict__["dtype"] = np.dtype(HTYPE_TO_DTYPE[var.info()[3]]) - shape = var.info()[2] + var_info = var.info() + var.__dict__["dtype"] = np.dtype(HTYPE_TO_DTYPE[var_info[3]]) + shape = var_info[2] var.__dict__["shape"] = shape if isinstance(shape, (tuple, list)) else tuple(shape) - return da.from_array(var, *args, **kwargs) + + name = kwargs.pop("name", None) + if name is None: + var_name = var_info[0] + tokenize_args = (os.fspath(src_path), var_name) + if args: + tokenize_args += (args,) + if kwargs: + tokenize_args += (kwargs,) + # put variable name in the front for easier dask debugging + name = var_name + "-" + tokenize(*tokenize_args) + return da.from_array(var, *args, name=name, **kwargs) class HDF4FileHandler(BaseFileHandler): @@ -92,7 +106,7 @@ def collect_metadata(self, name, obj): def _open_xarray_dataset(self, val, chunks=CHUNK_SIZE): """Read the band in blocks.""" - dask_arr = from_sds(val, chunks=chunks) + dask_arr = from_sds(val, self.filename, chunks=chunks) attrs = val.attributes() return xr.DataArray(dask_arr, dims=("y", "x"), attrs=attrs) diff --git a/satpy/readers/hdfeos_base.py b/satpy/readers/hdfeos_base.py index 3fd920c01f..7c25e1d09a 100644 --- a/satpy/readers/hdfeos_base.py +++ b/satpy/readers/hdfeos_base.py @@ -216,7 +216,7 @@ def load_dataset(self, dataset_name, is_category=False): dataset = self._read_dataset_in_file(dataset_name) chunks = self._chunks_for_variable(dataset) - dask_arr = from_sds(dataset, chunks=chunks) + dask_arr = from_sds(dataset, self.filename, chunks=chunks) dims = ("y", "x") if dask_arr.ndim == 2 else None data = xr.DataArray(dask_arr, dims=dims, attrs=dataset.attributes()) diff --git a/satpy/readers/modis_l1b.py b/satpy/readers/modis_l1b.py index 8280b30065..17bf5d56ae 100644 --- a/satpy/readers/modis_l1b.py +++ b/satpy/readers/modis_l1b.py @@ -117,7 +117,7 @@ def get_dataset(self, key, info): var_attrs = subdata.attributes() uncertainty = self.sd.select(var_name + "_Uncert_Indexes") chunks = self._chunks_for_variable(subdata) - array = xr.DataArray(from_sds(subdata, chunks=chunks)[band_index, :, :], + array = xr.DataArray(from_sds(subdata, self.filename, chunks=chunks)[band_index, :, :], dims=["y", "x"]).astype(np.float32) valid_range = var_attrs["valid_range"] valid_min = np.float32(valid_range[0]) @@ -214,7 +214,7 @@ def _mask_uncertain_pixels(self, array, uncertainty, band_index): if not self._mask_saturated: return array uncertainty_chunks = self._chunks_for_variable(uncertainty) - band_uncertainty = from_sds(uncertainty, chunks=uncertainty_chunks)[band_index, :, :] + band_uncertainty = from_sds(uncertainty, self.filename, chunks=uncertainty_chunks)[band_index, :, :] array = array.where(band_uncertainty < 15) return array diff --git a/satpy/readers/modis_l2.py b/satpy/readers/modis_l2.py index 8fdf1c69bb..2f2555692d 100644 --- a/satpy/readers/modis_l2.py +++ b/satpy/readers/modis_l2.py @@ -111,7 +111,7 @@ def read_geo_resolution(metadata): def _select_hdf_dataset(self, hdf_dataset_name, byte_dimension): """Load a dataset from HDF-EOS level 2 file.""" dataset = self.sd.select(hdf_dataset_name) - dask_arr = from_sds(dataset, chunks=CHUNK_SIZE) + dask_arr = from_sds(dataset, self.filename, chunks=CHUNK_SIZE) attrs = dataset.attributes() dims = ["y", "x"] if byte_dimension == 0: From 8a9d85cce7cb46436c77caad32c3b2d77ffa1425 Mon Sep 17 00:00:00 2001 From: David Hoese Date: Fri, 23 Aug 2024 09:46:27 -0500 Subject: [PATCH 2/3] Remove unnecessary *args from from_sds function --- satpy/readers/hdf4_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/satpy/readers/hdf4_utils.py b/satpy/readers/hdf4_utils.py index 2b836e9f26..17c162b5de 100644 --- a/satpy/readers/hdf4_utils.py +++ b/satpy/readers/hdf4_utils.py @@ -47,7 +47,7 @@ } -def from_sds(var, src_path, *args, **kwargs): +def from_sds(var, src_path, **kwargs): """Create a dask array from a SD dataset.""" var_info = var.info() var.__dict__["dtype"] = np.dtype(HTYPE_TO_DTYPE[var_info[3]]) @@ -58,8 +58,6 @@ def from_sds(var, src_path, *args, **kwargs): if name is None: var_name = var_info[0] tokenize_args = (os.fspath(src_path), var_name) - if args: - tokenize_args += (args,) if kwargs: tokenize_args += (kwargs,) # put variable name in the front for easier dask debugging From 5e27be4449305933393ac6ed5477d292eeb1ddec Mon Sep 17 00:00:00 2001 From: David Hoese Date: Fri, 23 Aug 2024 09:55:37 -0500 Subject: [PATCH 3/3] Fix missed use of *args --- satpy/readers/hdf4_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/satpy/readers/hdf4_utils.py b/satpy/readers/hdf4_utils.py index 17c162b5de..10f3b24b66 100644 --- a/satpy/readers/hdf4_utils.py +++ b/satpy/readers/hdf4_utils.py @@ -62,7 +62,7 @@ def from_sds(var, src_path, **kwargs): tokenize_args += (kwargs,) # put variable name in the front for easier dask debugging name = var_name + "-" + tokenize(*tokenize_args) - return da.from_array(var, *args, name=name, **kwargs) + return da.from_array(var, name=name, **kwargs) class HDF4FileHandler(BaseFileHandler):