From c7cb9f7eadb750618ea11121c2741b331fdf966c Mon Sep 17 00:00:00 2001
From: Anurag Nayak <nayakanurag078@gmail.com>
Date: Fri, 2 Aug 2024 20:59:21 +0530
Subject: [PATCH] added kerchunk as backend documentation (#9163)

* added kerchunk as backend documentation

* Update io.rst

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated the io.rst file

* updated io.rst

* modified the combined.json file

* Apply suggestions from code review

* added new references

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixed some typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Justus Magin <keewis@users.noreply.github.com>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
---
 ci/requirements/doc.yml |  1 +
 doc/combined.json       | 30 +++++++++++++++++++++++
 doc/user-guide/io.rst   | 53 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 doc/combined.json

diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml
index cbbbaa16d7a..d0484c5e999 100644
--- a/ci/requirements/doc.yml
+++ b/ci/requirements/doc.yml
@@ -8,6 +8,7 @@ dependencies:
   - bottleneck
   - cartopy
   - cfgrib
+  - kerchunk
   - dask-core>=2022.1
   - dask-expr
   - hypothesis>=6.75.8
diff --git a/doc/combined.json b/doc/combined.json
new file mode 100644
index 00000000000..345462e055f
--- /dev/null
+++ b/doc/combined.json
@@ -0,0 +1,30 @@
+{
+    "version": 1,
+    "refs": {
+        ".zgroup": "{\"zarr_format\":2}",
+        "foo/.zarray": "{\"chunks\":[4,5],\"compressor\":null,\"dtype\":\"<f8\",\"fill_value\":\"NaN\",\"filters\":null,\"order\":\"C\",\"shape\":[4,5],\"zarr_format\":2}",
+        "foo/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\",\"y\"],\"coordinates\":\"z\"}",
+        "foo/0.0": [
+            "saved_on_disk.h5",
+            8192,
+            160
+        ],
+        "x/.zarray": "{\"chunks\":[4],\"compressor\":null,\"dtype\":\"<i8\",\"fill_value\":null,\"filters\":null,\"order\":\"C\",\"shape\":[4],\"zarr_format\":2}",
+        "x/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\"]}",
+        "x/0": [
+            "saved_on_disk.h5",
+            8352,
+            32
+        ],
+        "y/.zarray": "{\"chunks\":[5],\"compressor\":null,\"dtype\":\"<i8\",\"fill_value\":null,\"filters\":null,\"order\":\"C\",\"shape\":[5],\"zarr_format\":2}",
+        "y/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"y\"],\"calendar\":\"proleptic_gregorian\",\"units\":\"days since 2000-01-01 00:00:00\"}",
+        "y/0": [
+            "saved_on_disk.h5",
+            8384,
+            40
+        ],
+        "z/.zarray": "{\"chunks\":[4],\"compressor\":null,\"dtype\":\"|O\",\"fill_value\":null,\"filters\":[{\"allow_nan\":true,\"check_circular\":true,\"encoding\":\"utf-8\",\"ensure_ascii\":true,\"id\":\"json2\",\"indent\":null,\"separators\":[\",\",\":\"],\"skipkeys\":false,\"sort_keys\":true,\"strict\":true}],\"order\":\"C\",\"shape\":[4],\"zarr_format\":2}",
+        "z/0": "[\"a\",\"b\",\"c\",\"d\",\"|O\",[4]]",
+        "z/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\"]}"
+    }
+}
diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst
index 85c47334858..fabff1000d7 100644
--- a/doc/user-guide/io.rst
+++ b/doc/user-guide/io.rst
@@ -1060,6 +1060,59 @@ reads. Because this fall-back option is so much slower, xarray issues a
        instead of falling back to try reading non-consolidated metadata.
 
 
+.. _io.kerchunk:
+
+Kerchunk
+--------
+
+`Kerchunk <https://fsspec.github.io/kerchunk/index.html>`_ is a Python library
+that allows you to access chunked and compressed data formats (such as NetCDF3, NetCDF4, HDF5, GRIB2, TIFF & FITS),
+many of which are primary data formats for many data archives, by viewing the
+whole archive as an ephemeral `Zarr`_ dataset which allows for parallel, chunk-specific access.
+
+Instead of creating a new copy of the dataset in the Zarr spec/format or
+downloading the files locally, Kerchunk reads through the data archive and extracts the
+byte range and compression information of each chunk and saves them as a ``reference``.
+These references are then saved as ``json`` files or ``parquet`` (more efficient)
+for later use. You can view some of these stored in the ``references``
+directory `here <https://github.com/pydata/xarray-data>`_.
+
+
+.. note::
+    These references follow this `specification <https://fsspec.github.io/kerchunk/spec.html>`_.
+    Packages like `kerchunk`_ and `virtualizarr <https://github.com/zarr-developers/VirtualiZarr>`_
+    help in creating and reading these references.
+
+
+Reading these data archives becomes really easy with ``kerchunk`` in combination
+with ``xarray``, especially when these archives are large in size. A single combined
+reference can refer to thousands of the original data files present in these archives.
+You can view the whole dataset from this `combined reference` using the above packages.
+
+The following example shows how to open a combined reference generated from a ``.hdf`` file stored locally.
+
+.. ipython:: python
+
+    storage_options = {
+        "target_protocol": "file",
+    }
+
+    # add the `remote_protocol` key in `storage_options` if you're accessing a file remotely
+
+    ds1 = xr.open_dataset(
+        "./combined.json",
+        engine="kerchunk",
+        storage_options=storage_options,
+    )
+
+    ds1
+
+.. note::
+
+    You can refer to the `Project Pythia Kerchunk cookbook <https://projectpythia.org/kerchunk-cookbook/README.html>`_
+    and the `Pangeo guide on Kerchunk <https://guide.cloudnativegeo.org/kerchunk/intro.html>`_ for more information.
+
+
 .. _io.iris:
 
 Iris