From 4a945ec111cff4ad0dff42049dd42c56d11fc6ac Mon Sep 17 00:00:00 2001 From: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> Date: Tue, 24 Jan 2023 12:07:20 +0000 Subject: [PATCH] =?UTF-8?q?Iris=20=E2=9D=A4=20Xarray=20docs=20page.=20(#50?= =?UTF-8?q?25)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Iris Xarray docs page. * Add links. * Xarray page styling. * What's New entry. * Minor docs fixes. * Overall experience section. * Xarray supports other plotting backends through external packages. Co-authored-by: Deepak Cherian * Section on converting between Iris and Xarray. * Clearer language around laziness and multi-processing. * To-do note about dates and fill values. * Move iris_xarray page into a new Community section. * Language fixes from @bjlittle review. Co-authored-by: Deepak Cherian --- docs/src/common_links.inc | 1 + docs/src/community/index.rst | 48 ++++++ docs/src/community/iris_xarray.rst | 154 ++++++++++++++++++ docs/src/conf.py | 1 + .../further_topics/ugrid/partner_packages.rst | 3 +- docs/src/index.rst | 9 + docs/src/whatsnew/latest.rst | 8 + lib/iris/_lazy_data.py | 20 ++- lib/iris/cube.py | 3 +- lib/iris/experimental/ugrid/mesh.py | 4 +- lib/iris/pandas.py | 4 +- lib/iris/util.py | 7 +- 12 files changed, 244 insertions(+), 18 deletions(-) create mode 100644 docs/src/community/index.rst create mode 100644 docs/src/community/iris_xarray.rst diff --git a/docs/src/common_links.inc b/docs/src/common_links.inc index 530ebc4877..bf1567e917 100644 --- a/docs/src/common_links.inc +++ b/docs/src/common_links.inc @@ -40,6 +40,7 @@ .. _CF-UGRID: https://ugrid-conventions.github.io/ugrid-conventions/ .. _issues on GitHub: https://github.com/SciTools/iris/issues?q=is%3Aopen+is%3Aissue+sort%3Areactions-%2B1-desc .. _python-stratify: https://github.com/SciTools/python-stratify +.. _iris-esmf-regrid: https://github.com/SciTools-incubator/iris-esmf-regrid .. comment diff --git a/docs/src/community/index.rst b/docs/src/community/index.rst new file mode 100644 index 0000000000..a9e2a4d040 --- /dev/null +++ b/docs/src/community/index.rst @@ -0,0 +1,48 @@ +.. include:: ../common_links.inc + +.. todo: + consider scientific-python.org + consider scientific-python.org/specs/ + +Iris in the Community +===================== + +Iris aims to be a valuable member of the open source scientific Python +community. + +We listen out for developments in our dependencies and neighbouring projects, +and we reach out to them when we can solve problems together; please feel free +to reach out to us! + +We are aware of our place in the user's wider 'toolbox' - offering unique +functionality and interoperating smoothly with other packages. + +We welcome contributions from all; whether that's an opinion, a 1-line +clarification, or a whole new feature 🙂 + +Quick Links +----------- + +* `GitHub Discussions`_ +* :ref:`Getting involved` +* `Twitter `_ + +Interoperability +---------------- + +There's a big choice of Python tools out there! Each one has strengths and +weaknesses in different areas, so we don't want to force a single choice for your +whole workflow - we'd much rather make it easy for you to choose the right tool +for the moment, switching whenever you need. Below are our ongoing efforts at +smoother interoperability: + +.. not using toctree due to combination of child pages and cross-references. + +* The :mod:`iris.pandas` module +* :doc:`iris_xarray` + +.. toctree:: + :maxdepth: 1 + :hidden: + + iris_xarray diff --git a/docs/src/community/iris_xarray.rst b/docs/src/community/iris_xarray.rst new file mode 100644 index 0000000000..859597da78 --- /dev/null +++ b/docs/src/community/iris_xarray.rst @@ -0,0 +1,154 @@ +.. include:: ../common_links.inc + +====================== +Iris ❤️ :term:`Xarray` +====================== + +There is a lot of overlap between Iris and :term:`Xarray`, but some important +differences too. Below is a summary of the most important differences, so that +you can be prepared, and to help you choose the best package for your use case. + +Overall Experience +------------------ + +Iris is the more specialised package, focussed on making it as easy +as possible to work with meteorological and climatological data. Iris +is built to natively handle many key concepts, such as the CF conventions, +coordinate systems and bounded coordinates. Iris offers a smaller toolkit of +operations compared to Xarray, particularly around API for sophisticated +computation such as array manipulation and multi-processing. + +Xarray's more generic data model and community-driven development give it a +richer range of operations and broader possible uses. Using Xarray +specifically for meteorology/climatology may require deeper knowledge +compared to using Iris, and you may prefer to add Xarray plugins +such as :ref:`cfxarray` to get the best experience. Advanced users can likely +achieve better performance with Xarray than with Iris. + +Conversion +---------- +There are multiple ways to convert between Iris and Xarray objects. + +* Xarray includes the :meth:`~xarray.DataArray.to_iris` and + :meth:`~xarray.DataArray.from_iris` methods - detailed in the + `Xarray IO notes on Iris`_. Since Iris evolves independently of Xarray, be + vigilant for concepts that may be lost during the conversion. +* Because both packages are closely linked to the :term:`NetCDF Format`, it is + feasible to save a NetCDF file using one package then load that file using + the other package. This will be lossy in places, as both Iris and Xarray + are opinionated on how certain NetCDF concepts relate to their data models. +* The Iris development team are exploring an improved 'bridge' between the two + packages. Follow the conversation on GitHub: `iris#4994`_. This project is + expressly intended to be as lossless as possible. + +Regridding +---------- +Iris and Xarray offer a range of regridding methods - both natively and via +additional packages such as `iris-esmf-regrid`_ and `xESMF`_ - which overlap +in places +but tend to cover a different set of use cases (e.g. Iris handles unstructured +meshes but offers access to fewer ESMF methods). The behaviour of these +regridders also differs slightly (even between different regridders attached to +the same package) so the appropriate package to use depends highly on the +particulars of the use case. + +Plotting +-------- +Xarray and Iris have a large overlap of functionality when creating +:term:`Matplotlib` plots and both support the plotting of multidimensional +coordinates. This means the experience is largely similar using either package. + +Xarray supports further plotting backends through external packages (e.g. Bokeh through `hvPlot`_) +and, if a user is already familiar with `pandas`_, the interface should be +familiar. It also supports some different plot types to Iris, and therefore can +be used for a wider variety of plots. It also has benefits regarding "out of +the box", quick customisations to plots. However, if further customisation is +required, knowledge of matplotlib is still required. + +In both cases, :term:`Cartopy` is/can be used. Iris does more work +automatically for the user here, creating Cartopy +:class:`~cartopy.mpl.geoaxes.GeoAxes` for latitude and longitude coordinates, +whereas the user has to do this manually in Xarray. + +Statistics +---------- +Both libraries are quite comparable with generally similar capabilities, +performance and laziness. Iris offers more specificity in some cases, such as +some more specific unique functions and masked tolerance in most statistics. +Xarray seems more approachable however, with some less unique but more +convenient solutions (these tend to be wrappers to :term:`Dask` functions). + +Laziness and Multi-Processing with :term:`Dask` +----------------------------------------------- +Iris and Xarray both support lazy data and out-of-core processing through +utilisation of Dask. + +While both Iris and Xarray expose :term:`NumPy` conveniences at the API level +(e.g. the `ndim()` method), only Xarray exposes Dask conveniences. For example +:attr:`xarray.DataArray.chunks`, which gives the user direct control +over the underlying Dask array chunks. The Iris API instead takes control of +such concepts and user control is only possible by manipulating the underlying +Dask array directly (accessed via :meth:`iris.cube.Cube.core_data`). + +:class:`xarray.DataArray`\ s comply with `NEP-18`_, allowing NumPy arrays to be +based on them, and they also include the necessary extra members for Dask +arrays to be based on them too. Neither of these is currently possible with +Iris :class:`~iris.cube.Cube`\ s, although an ambition for the future. + +NetCDF File Control +------------------- +(More info: :term:`NetCDF Format`) + +Unlike Iris, Xarray generally provides full control of major file structures, +i.e. dimensions + variables, including their order in the file. It mostly +respects these in a file input, and can reproduce them on output. +However, attribute handling is not so complete: like Iris, it interprets and +modifies some recognised aspects, and can add some extra attributes not in the +input. + +.. todo: + More detail on dates and fill values (@pp-mo suggestion). + +Handling of dates and fill values have some special problems here. + +Ultimately, nearly everything wanted in a particular desired result file can +be achieved in Xarray, via provided override mechanisms (`loading keywords`_ +and the '`encoding`_' dictionaries). + +Missing Data +------------ +Xarray uses :data:`numpy.nan` to represent missing values and this will support +many simple use cases assuming the data are floats. Iris enables more +sophisticated missing data handling by representing missing values as masks +(:class:`numpy.ma.MaskedArray` for real data and :class:`dask.array.Array` +for lazy data) which allows data to be any data type and to include either/both +a mask and :data:`~numpy.nan`\ s. + +.. _cfxarray: + +`cf-xarray`_ +------------- +Iris has a data model entirely based on :term:`CF Conventions`. Xarray has a +data model based on :term:`NetCDF Format` with cf-xarray acting as translation +into CF. Xarray/cf-xarray methods can be +called and data accessed with CF like arguments (e.g. axis, standard name) and +there are some CF specific utilities (similar +to Iris utilities). Iris tends to cover more of and be stricter about CF. + + +.. seealso:: + + * `Xarray IO notes on Iris`_ + * `Xarray notes on other NetCDF libraries`_ + +.. _Xarray IO notes on Iris: https://docs.xarray.dev/en/stable/user-guide/io.html#iris +.. _Xarray notes on other NetCDF libraries: https://docs.xarray.dev/en/stable/getting-started-guide/faq.html#what-other-netcdf-related-python-libraries-should-i-know-about +.. _loading keywords: https://docs.xarray.dev/en/stable/generated/xarray.open_dataset.html#xarray.open_dataset +.. _encoding: https://docs.xarray.dev/en/stable/user-guide/io.html#writing-encoded-data +.. _xESMF: https://github.com/pangeo-data/xESMF/ +.. _seaborn: https://seaborn.pydata.org/ +.. _hvPlot: https://hvplot.holoviz.org/ +.. _pandas: https://pandas.pydata.org/ +.. _NEP-18: https://numpy.org/neps/nep-0018-array-function-protocol.html +.. _cf-xarray: https://github.com/xarray-contrib/cf-xarray +.. _iris#4994: https://github.com/SciTools/iris/issues/4994 diff --git a/docs/src/conf.py b/docs/src/conf.py index 3636f94d1b..576a099b90 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -223,6 +223,7 @@ def _dotv(version): "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/", None), "pandas": ("https://pandas.pydata.org/docs/", None), + "dask": ("https://docs.dask.org/en/stable/", None), } # The name of the Pygments (syntax highlighting) style to use. diff --git a/docs/src/further_topics/ugrid/partner_packages.rst b/docs/src/further_topics/ugrid/partner_packages.rst index 8e36f4ffc2..75b54b037f 100644 --- a/docs/src/further_topics/ugrid/partner_packages.rst +++ b/docs/src/further_topics/ugrid/partner_packages.rst @@ -1,3 +1,5 @@ +.. include:: ../../common_links.inc + .. _ugrid partners: Iris' Mesh Partner Packages @@ -97,4 +99,3 @@ Applications .. _GeoVista: https://github.com/bjlittle/geovista .. _PyVista: https://docs.pyvista.org/index.html -.. _iris-esmf-regrid: https://github.com/SciTools-incubator/iris-esmf-regrid diff --git a/docs/src/index.rst b/docs/src/index.rst index c5d654ed31..531c0e0b26 100644 --- a/docs/src/index.rst +++ b/docs/src/index.rst @@ -136,6 +136,15 @@ The legacy support resources: developers_guide/contributing_getting_involved +.. toctree:: + :caption: Community + :maxdepth: 1 + :name: community_index + :hidden: + + Community + + .. toctree:: :caption: Iris API :maxdepth: 1 diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 2cdc9cf1f5..903e6880f0 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -82,6 +82,14 @@ This document explains the changes made to Iris for this release and removed an ECMWF link in the ``v1.0`` What's New that was failing the linkcheck CI. (:pull:`5109`) +#. `@trexfeathers`_ added a new top-level :doc:`/community/index` section, + as a one-stop place to find out about getting involved, and how we relate + to other projects. (:pull:`5025`) + +#. The **Iris community**, with help from the **Xarray community**, produced + the :doc:`/community/iris_xarray` page, highlighting the similarities and + differences between the two packages. (:pull:`5025`) + 💼 Internal =========== diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index ac7ae34511..e0566fc8f2 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -39,7 +39,7 @@ def is_lazy_data(data): """ Return whether the argument is an Iris 'lazy' data array. - At present, this means simply a Dask array. + At present, this means simply a :class:`dask.array.Array`. We determine this by checking for a "compute" property. """ @@ -67,7 +67,8 @@ def _optimum_chunksize_internals( * shape (tuple of int): The full array shape of the target data. * limit (int): - The 'ideal' target chunk size, in bytes. Default from dask.config. + The 'ideal' target chunk size, in bytes. Default from + :mod:`dask.config`. * dtype (np.dtype): Numpy dtype of target data. @@ -77,7 +78,7 @@ def _optimum_chunksize_internals( .. note:: The purpose of this is very similar to - `dask.array.core.normalize_chunks`, when called as + :func:`dask.array.core.normalize_chunks`, when called as `(chunks='auto', shape, dtype=dtype, previous_chunks=chunks, ...)`. Except, the operation here is optimised specifically for a 'c-like' dimension order, i.e. outer dimensions first, as for netcdf variables. @@ -174,13 +175,13 @@ def _optimum_chunksize( def as_lazy_data(data, chunks=None, asarray=False): """ - Convert the input array `data` to a dask array. + Convert the input array `data` to a :class:`dask.array.Array`. Args: * data (array-like): An indexable object with 'shape', 'dtype' and 'ndim' properties. - This will be converted to a dask array. + This will be converted to a :class:`dask.array.Array`. Kwargs: @@ -192,7 +193,7 @@ def as_lazy_data(data, chunks=None, asarray=False): Set to False (default) to pass passed chunks through unchanged. Returns: - The input array converted to a dask array. + The input array converted to a :class:`dask.array.Array`. .. note:: The result chunk size is a multiple of 'chunks', if given, up to the @@ -284,15 +285,16 @@ def multidim_lazy_stack(stack): """ Recursively build a multidimensional stacked dask array. - This is needed because dask.array.stack only accepts a 1-dimensional list. + This is needed because :meth:`dask.array.Array.stack` only accepts a + 1-dimensional list. Args: * stack: - An ndarray of dask arrays. + An ndarray of :class:`dask.array.Array`. Returns: - The input array converted to a lazy dask array. + The input array converted to a lazy :class:`dask.array.Array`. """ if stack.ndim == 0: diff --git a/lib/iris/cube.py b/lib/iris/cube.py index be01bc9e5d..c60896f718 100644 --- a/lib/iris/cube.py +++ b/lib/iris/cube.py @@ -884,7 +884,8 @@ def __init__( This object defines the shape of the cube and the phenomenon value in each cell. - ``data`` can be a dask array, a NumPy array, a NumPy array + ``data`` can be a :class:`dask.array.Array`, a + :class:`numpy.ndarray`, a NumPy array subclass (such as :class:`numpy.ma.MaskedArray`), or array_like (as described in :func:`numpy.asarray`). diff --git a/lib/iris/experimental/ugrid/mesh.py b/lib/iris/experimental/ugrid/mesh.py index 4fd09175af..d18e1b2026 100644 --- a/lib/iris/experimental/ugrid/mesh.py +++ b/lib/iris/experimental/ugrid/mesh.py @@ -131,7 +131,7 @@ def __init__( Args: - * indices (numpy.ndarray or numpy.ma.core.MaskedArray or dask.array.Array): + * indices (:class:`numpy.ndarray` or :class:`numpy.ma.core.MaskedArray` or :class:`dask.array.Array`): 2D array giving the topological connection relationship between :attr:`location` elements and :attr:`connected` elements. The :attr:`location_axis` dimension indexes over the @@ -501,7 +501,7 @@ def core_indices(self): NumPy array or a Dask array. Returns: - numpy.ndarray or numpy.ma.core.MaskedArray or dask.array.Array + :class:`numpy.ndarray` or :class:`numpy.ma.core.MaskedArray` or :class:`dask.array.Array` """ return super()._core_values() diff --git a/lib/iris/pandas.py b/lib/iris/pandas.py index faa250285e..417b6b11de 100644 --- a/lib/iris/pandas.py +++ b/lib/iris/pandas.py @@ -238,7 +238,7 @@ def as_cubes( A :class:`~pandas.DataFrame` using columns as a second data dimension will need to be 'melted' before conversion. See the Examples for how. - Dask ``DataFrame``\\s are not supported. + :class:`dask.dataframe.DataFrame`\\ s are not supported. Examples -------- @@ -686,7 +686,7 @@ def as_data_frame( Notes ----- - Dask ``DataFrame``\\s are not supported. + :class:`dask.dataframe.DataFrame`\\ s are not supported. A :class:`~pandas.MultiIndex` :class:`~pandas.DataFrame` is returned by default. Use the :meth:`~pandas.DataFrame.reset_index` to return a diff --git a/lib/iris/util.py b/lib/iris/util.py index 3d82ea68c5..1cf41e6fb6 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -1815,8 +1815,9 @@ def _mask_array(array, points_to_mask, in_place=False): If array is lazy then in_place is ignored: _math_op_common will use the returned value regardless of in_place, so we do not need to implement it - here. If in_place is True then array must be a np.ma.MaskedArray or dask - array (must be a dask array if points_to_mask is lazy). + here. If in_place is True then array must be a + :class:`numpy.ma.MaskedArray` or :class:`dask.array.Array` + (must be a dask array if points_to_mask is lazy). """ # Decide which array library to use. @@ -1978,7 +1979,7 @@ def is_masked(array): Parameters ---------- - array : :class:`numpy.Array` or `dask.array.Array` + array : :class:`numpy.Array` or :class:`dask.array.Array` The array to be checked for masks. Returns