diff --git a/satpy/readers/fci_l1c_fdhsi.py b/satpy/readers/fci_l1c_fdhsi.py
index 43ea8c48a3..60ffab649e 100644
--- a/satpy/readers/fci_l1c_fdhsi.py
+++ b/satpy/readers/fci_l1c_fdhsi.py
@@ -56,7 +56,9 @@ class using the :mod:`~satpy.Scene.load` method with the reader
     def __init__(self, filename, filename_info, filetype_info):
         """Initialize file handler."""
         super(FCIFDHSIFileHandler, self).__init__(filename, filename_info,
-                                                  filetype_info)
+                                                  filetype_info,
+                                                  cache_var_size=10000,
+                                                  cache_handle=True)
         logger.debug('Reading: {}'.format(self.filename))
         logger.debug('Start: {}'.format(self.start_time))
         logger.debug('End: {}'.format(self.end_time))
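From the user side this change is transparent: the FCI handler simply opts in to caching variables smaller than 10 kB and to persistent file handles. As a rough sketch of how the handler is typically exercised (the reader name is assumed to match the module name, and the file list and channel name are illustrative placeholders):

    from satpy import Scene

    # Hypothetical list of FDHSI chunk files; every file handler the reader
    # creates will now cache small variables and keep its handle open.
    scn = Scene(filenames=fdhsi_files, reader='fci_l1c_fdhsi')
    scn.load(['ir_105'])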
""" + file_handle = None + def __init__(self, filename, filename_info, filetype_info, - auto_maskandscale=False, xarray_kwargs=None): - """Initilize file handler.""" + auto_maskandscale=False, xarray_kwargs=None, + cache_var_size=0, cache_handle=False): super(NetCDF4FileHandler, self).__init__( filename, filename_info, filetype_info) self.file_content = {} + self.cached_file_content = {} try: file_handle = netCDF4.Dataset(self.filename, 'r') except IOError: @@ -75,11 +102,29 @@ def __init__(self, filename, filename_info, filetype_info, self.collect_metadata("", file_handle) self.collect_dimensions("", file_handle) - file_handle.close() + if cache_var_size > 0: + self.collect_cache_vars( + [varname for (varname, var) + in self.file_content.items() + if isinstance(var, netCDF4.Variable) + and isinstance(var.dtype, np.dtype) # vlen may be str + and var.size * var.dtype.itemsize < cache_var_size], + file_handle) + if cache_handle: + self.file_handle = file_handle + else: + file_handle.close() self._xarray_kwargs = xarray_kwargs or {} self._xarray_kwargs.setdefault('chunks', CHUNK_SIZE) self._xarray_kwargs.setdefault('mask_and_scale', self.auto_maskandscale) + def __del__(self): + if self.file_handle is not None: + try: + self.file_handle.close() + except RuntimeError: # presumably closed already + pass + def _collect_attrs(self, name, obj): """Collect all the attributes for the provided file object.""" for key in obj.ncattrs(): @@ -113,10 +158,31 @@ def collect_dimensions(self, name, obj): dim_name = "{}/dimension/{}".format(name, dim_name) self.file_content[dim_name] = len(dim_obj) + def collect_cache_vars(self, cache_vars, obj): + """Collect data variables for caching. + + This method will collect some data variables and store them in RAM. + This may be useful if some small variables are frequently accessed, + to prevent needlessly frequently opening and closing the file, which + in case of xarray is associated with some overhead. + + Should be called later than `collect_metadata`. + + Args: + cache_vars (List[str]): Names of data variables to be cached. + obj (netCDF4.Dataset): Dataset object from which to read them. + """ + for var_name in cache_vars: + v = self.file_content[var_name] + self.cached_file_content[var_name] = xr.DataArray( + v[:], dims=v.dimensions, attrs=v.__dict__, name=v.name) + def __getitem__(self, key): """Get item for given key.""" val = self.file_content[key] if isinstance(val, netCDF4.Variable): + if key in self.cached_file_content: + return self.cached_file_content[key] # these datasets are closed and inaccessible when the file is # closed, need to reopen # TODO: Handle HDF4 versus NetCDF3 versus NetCDF4 @@ -125,21 +191,38 @@ def __getitem__(self, key): group, key = parts else: group = None - with xr.open_dataset(self.filename, group=group, - **self._xarray_kwargs) as nc: - val = nc[key] - # Even though `chunks` is specified in the kwargs, xarray - # uses dask.arrays only for data variables that have at least - # one dimension; for zero-dimensional data variables (scalar), - # it uses its own lazy loading for scalars. When those are - # accessed after file closure, xarray reopens the file without - # closing it again. 
diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py
index aa16774965..0204d88dc9 100644
--- a/satpy/tests/reader_tests/test_netcdf_utils.py
+++ b/satpy/tests/reader_tests/test_netcdf_utils.py
@@ -93,6 +93,11 @@ def setUp(self):
         ds2_i = nc.createVariable('ds2_i', np.int32,
                                   dimensions=('rows', 'cols'))
         ds2_i[:] = np.arange(10 * 100).reshape((10, 100))
+        ds2_s = nc.createVariable("ds2_s", np.int8,
+                                  dimensions=("rows",))
+        ds2_s[:] = np.arange(10)
+        ds2_sc = nc.createVariable("ds2_sc", np.int8, dimensions=())
+        ds2_sc[:] = 42
 
         # Add attributes
         nc.test_attr_str = 'test_string'
@@ -138,6 +143,34 @@ def test_all_basic(self):
         self.assertTrue('ds2_f' in file_handler)
         self.assertFalse('fake_ds' in file_handler)
+        self.assertIsNone(file_handler.file_handle)
+        self.assertEqual(file_handler["ds2_sc"], 42)
+
+    def test_caching(self):
+        """Test that caching works as intended."""
+        from satpy.readers.netcdf_utils import NetCDF4FileHandler
+        h = NetCDF4FileHandler("test.nc", {}, {}, cache_var_size=1000,
+                               cache_handle=True)
+        self.assertIsNotNone(h.file_handle)
+        self.assertTrue(h.file_handle.isopen())
+
+        self.assertEqual(sorted(h.cached_file_content.keys()),
+                         ["ds2_s", "ds2_sc"])
+        # with caching, these accesses exercise different code paths than
+        # without
+        np.testing.assert_array_equal(h["ds2_s"], np.arange(10))
+        np.testing.assert_array_equal(h["test_group/ds1_i"],
+                                      np.arange(10 * 100).reshape((10, 100)))
+        h.__del__()
+        self.assertFalse(h.file_handle.isopen())
+
+    def test_filenotfound(self):
+        """Test that an error is raised when the file is not found."""
+        from satpy.readers.netcdf_utils import NetCDF4FileHandler
+
+        with self.assertRaises(IOError):
+            NetCDF4FileHandler("/thisfiledoesnotexist.nc", {}, {})
 
 
 def suite():
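For reference, the core trick that ``test_caching`` exercises through ``_get_var_from_filehandle`` can be reproduced standalone. A minimal sketch, assuming an existing NetCDF file ``my_file.nc`` containing a variable ``var_name`` (both names hypothetical):

    import netCDF4
    import xarray as xr
    import dask.array as da

    nc = netCDF4.Dataset("my_file.nc", "r")  # handle stays open
    v = nc["var_name"]
    # Wrap the open netCDF4 variable in a dask array "manually" rather than
    # going through xarray.open_dataset; chunks are read lazily through the
    # still-open handle only when the array is computed.
    arr = xr.DataArray(da.from_array(v), dims=v.dimensions,
                       attrs=v.__dict__, name=v.name)
    print(arr.mean().compute())  # triggers the actual read

This mirrors the diff's approach: no coordinates are attached, which keeps the per-variable overhead low at the cost of losing coordinate metadata.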