diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py
index e0297a6cb..4b03a9f2c 100644
--- a/docs/gallery/advanced_io/streaming.py
+++ b/docs/gallery/advanced_io/streaming.py
@@ -1,98 +1,118 @@
 '''
 .. _streaming:
 
-Streaming from an S3 Bucket
-===========================
+Streaming NWB files
+===================
 
-It is possible to read data directly from an S3 bucket, such as data from the `DANDI Archive
-<https://dandiarchive.org/>`_. This is especially useful for reading small pieces of data
-from a large NWB file stored remotely. In fact, there are two different ways to do this supported by PyNWB.
+You can read specific sections within individual data files directly from remote stores such as the
+`DANDI Archive <https://dandiarchive.org/>`_. This is especially useful for reading small pieces of data
+from a large NWB file stored remotely. First, you will need to get the location of the file. The code below
+illustrates how to do this on DANDI using the ``dandi`` API library.
 
-Method 1: ROS3
-~~~~~~~~~~~~~~
-ROS3 stands for "read only S3" and is a driver created by the HDF5 group that allows HDF5 to read HDF5 files
-stored on s3. Using this method requires that your HDF5 library is installed with the ROS3 driver enabled. This
-is not the default configuration, so you will need to make sure you install the right version of h5py that has this
-advanced configuration enabled. You can install HDF5 with the ROS3 driver from `conda-forge
-<https://conda-forge.org/>`_ using ``conda``. You may first need to uninstall a currently installed version of h5py.
+Getting the location of the file on DANDI
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-'''
+The :py:class:`~dandi.dandiapi.DandiAPIClient` can be used to get the S3 URL of any NWB file stored in the DANDI
+Archive. If you have not already, install the latest release of the ``dandi`` package.
+
+.. code-block:: bash
+
+    pip install dandi
+
+Now you can get the URL of a particular NWB file using the dandiset ID and the path of that file within the dandiset.
+
+.. code-block:: python
+
+    from dandi.dandiapi import DandiAPIClient
+
+    dandiset_id = '000006'  # ephys dataset from the Svoboda Lab
+    filepath = 'sub-anm372795/sub-anm372795_ses-20170718.nwb'  # 450 kB file
+    with DandiAPIClient() as client:
+        asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath)
+        s3_url = asset.get_content_url(follow_redirects=1, strip_query=True)
+
+
+Streaming Method 1: ROS3
+~~~~~~~~~~~~~~~~~~~~~~~~
+ROS3 is one of the supported methods for reading data from a remote store. ROS3 stands for "read only S3" and is a
+driver created by the HDF5 Group that allows HDF5 to read HDF5 files stored remotely in S3 buckets. Using this method
+requires that your HDF5 library is installed with the ROS3 driver enabled. This is not the default configuration,
+so you will need to make sure you install the right version of ``h5py`` that has this advanced configuration enabled.
+You can install HDF5 with the ROS3 driver from `conda-forge <https://conda-forge.org/>`_ using ``conda``. You may
+first need to uninstall a currently installed version of ``h5py``.
+
+.. code-block:: bash
+
+    pip uninstall h5py
+    conda install -c conda-forge "h5py>=3.2"
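+
+Because ROS3 support depends on how the underlying HDF5 library was built, it can be useful to verify that the
+driver is actually available before trying to open a remote file. One quick sanity check is to look for ``"ros3"``
+among the drivers registered with ``h5py``.
+
+.. code-block:: python
+
+    import h5py
+
+    # 'ros3' is listed only when HDF5 was built with the read-only S3 driver
+    print('ros3' in h5py.registered_drivers())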
+
+Now instantiate a :py:class:`~pynwb.NWBHDF5IO` object with the S3 URL and specify the driver as "ros3". This
+will download metadata about the file from the S3 bucket to memory. The values of datasets are accessed lazily,
+just like when reading an NWB file stored locally. So, slicing into a dataset will require additional time to
+download the sliced data (and only the sliced data) to memory.
+
+.. code-block:: python
+
+    from pynwb import NWBHDF5IO
 
-####################
-# .. code-block:: bash
-#
-#   pip uninstall h5py
-#   conda install -c conda-forge "h5py>=3.2"
-#
-
-####################
-# The ``DandiAPIClient`` can be used to get the S3 URL to an NWB file of interest stored in the DANDI Archive.
-# If you have not already, install the latest release of the ``dandi`` package.
-#
-# .. code-block:: bash
-#
-#   pip install dandi
-#
-# .. code-block:: python
-#
-#     from dandi.dandiapi import DandiAPIClient
-#
-#     dandiset_id = '000006'  # ephys dataset from the Svoboda Lab
-#     filepath = 'sub-anm372795/sub-anm372795_ses-20170718.nwb'  # 450 kB file
-#     with DandiAPIClient() as client:
-#         asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath)
-#         s3_path = asset.get_content_url(follow_redirects=1, strip_query=True)
-
-####################
-# Finally, instantiate a :py:class:`~pynwb.NWBHDF5IO` object with the S3 URL and specify the driver as "ros3". This
-# will download metadata about the file from the S3 bucket to memory. The values of datasets are accessed lazily,
-# just like when reading an NWB file stored locally. So, slicing into a dataset will require additional time to
-# download the sliced data (and only the sliced data) to memory.
-#
-# .. code-block:: python
-#
-#     from pynwb import NWBHDF5IO
-#
-#     with NWBHDF5IO(s3_path, mode='r', load_namespaces=True, driver='ros3') as io:
-#         nwbfile = io.read()
-#         print(nwbfile)
-#         print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:])
-
-####################
-# Method 2: s3fs
-# ~~~~~~~~~~~~~~
-# s3fs is a library that creates a virtual filesystem for an S3 store. With this approach, a virtual file is created
-# for the file and virtual filesystem layer will take care of requesting data from the s3 bucket whenever data is
-# read from the virtual file.
-#
-# First install s3fs:
-#
-# .. code-block:: bash
-#
-#   pip install s3fs
-#
-# Then in Python:
-#
-# .. code-block:: python
-#
-#    import s3fs
-#    import pynwb
-#    import h5py
-#
-#    fs = s3fs.S3FileSystem(anon=True)
-#
-#    f = fs.open("s3://dandiarchive/blobs/43b/f3a/43bf3a81-4a0b-433f-b471-1f10303f9d35", 'rb')
-#    file = h5py.File(f)
-#    io = pynwb.NWBHDF5IO(file=file, load_namespaces=True)
-#
-#    io.read()
-#
-# The above snippet opens an arbitrary file on DANDI. You can use the ``DandiAPIClient`` to find the s3 path,
-# but you will need to adjust this url to give it a prefix of "s3://dandiarchive/" as shown above.
-#
-# The s3fs approach has the advantage of being more robust that ROS3. Sometimes s3 requests are interrupted,
-# and s3fs has internal mechanisms to retry these requests automatically, whereas ROS3 does not. However, it may not
-# be available on all platforms. s3fs does not currently work for Windows.
+    with NWBHDF5IO(s3_url, mode='r', load_namespaces=True, driver='ros3') as io:
+        nwbfile = io.read()
+        print(nwbfile)
+        print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:])
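+
+Because only the data needed for a slice is transferred, it is usually much faster to request just the region of a
+dataset you need rather than loading the whole array with ``[:]``. The sketch below assumes the same ``s3_url`` and
+dataset used in the example above and restricts the read to the first ten values.
+
+.. code-block:: python
+
+    from pynwb import NWBHDF5IO
+
+    io = NWBHDF5IO(s3_url, mode='r', load_namespaces=True, driver='ros3')
+    nwbfile = io.read()
+    lick_left_times = nwbfile.acquisition['lick_times'].time_series['lick_left_times']
+    # only the data needed for this slice is downloaded, not the full dataset
+    print(lick_left_times.data[:10])
+    io.close()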
 
+Streaming Method 2: fsspec
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+fsspec is another data streaming approach that is quite flexible and has several performance advantages. This library
+creates a virtual filesystem for remote stores. With this approach, a virtual file is created for the remote NWB file,
+and the virtual filesystem layer takes care of requesting data from the S3 bucket whenever data is read from the
+virtual file. Note that this approach is completely unaware of the internals of the HDF5 format, so it can work with
+**any** file type, not just files read with h5py and PyNWB.
+
+First install ``fsspec`` and the dependencies of the :py:class:`~fsspec.implementations.http.HTTPFileSystem`:
+
+.. code-block:: bash
+
+    pip install fsspec requests aiohttp
+
+Then in Python:
+
+.. code-block:: python
+
+    import fsspec
+    import pynwb
+    import h5py
+    from fsspec.implementations.cached import CachingFileSystem
+
+    # first, create a virtual filesystem based on the http protocol and use
+    # caching to save accessed data to a local folder on disk
+    fs = CachingFileSystem(
+        fs=fsspec.filesystem("http"),
+        cache_storage="nwb-cache",  # Local folder for the cache
+    )
+
+    # next, open the file
+    with fs.open(s3_url, "rb") as f:
+        with h5py.File(f) as file:
+            with pynwb.NWBHDF5IO(file=file, load_namespaces=True) as io:
+                nwbfile = io.read()
+                print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:])
+
+
+fsspec can be used to access a variety of different stores, including (at the time of writing):
+
+.. code-block:: python
+
+    from fsspec.registry import known_implementations
+    known_implementations.keys()
+
+file, memory, dropbox, http, https, zip, tar, gcs, gs, gdrive, sftp, ssh, ftp, hdfs, arrow_hdfs, webhdfs, s3, s3a,
+wandb, oci, adl, abfs, az, cached, blockcache, filecache, simplecache, dask, dbfs, github, git, smb, jupyter, jlab,
+libarchive, reference
+
+The S3 backend, in particular, may provide additional functionality for accessing data on DANDI. See the
+`fsspec documentation <https://filesystem-spec.readthedocs.io/en/latest/>`_ for a full, up-to-date list of
+supported stores.
+'''
 
 # sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnails_streaming.png'
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8a95ec9dd..7878b875c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -143,6 +143,7 @@ def __call__(self, filename):
     'hdmf': ('https://hdmf.readthedocs.io/en/latest/', None),
     'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
     'dandi': ('https://dandi.readthedocs.io/en/stable/', None),
+    'fsspec': ('https://filesystem-spec.readthedocs.io/en/latest/', None),
 }
 
 extlinks = {'incf_lesson': ('https://training.incf.org/lesson/%s', ''),