🔧 Integrate doctest to run on our ragged.py module #381

Merged
merged 13 commits on Apr 11, 2024
21 changes: 19 additions & 2 deletions .github/workflows/ci.yml
@@ -13,7 +13,7 @@ on:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
test:
unittest:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@@ -33,7 +33,7 @@ jobs:
cartopy
- name: Run unit tests
shell: bash -l {0}
run: coverage run -m unittest discover -s tests -p "*.py"
run: coverage run -m unittest discover -s tests -p "*_tests.py"
- name: Create coverage report
shell: bash -l {0}
run: |
@@ -43,3 +43,20 @@ jobs:
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
doctest:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest"]
python-version: ["3.10"]
steps:
- uses: actions/checkout@v4
- uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
environment-name: clouddrift
- name: Run doc tests
shell: bash -l {0}
run: |
python -m unittest tests/docexamples.py
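
The new job runs every docstring example in ragged.py through unittest via tests/docexamples.py. For a quick local check, a minimal sketch (assuming clouddrift is importable in the active environment; this snippet is not part of the diff) is to hand the module to doctest directly:

import doctest

import clouddrift.ragged as ragged

# Run every docstring example in clouddrift.ragged; ELLIPSIS lets expected
# outputs such as "<xarray.Dataset> ..." match variable output.
results = doctest.testmod(ragged, optionflags=doctest.ELLIPSIS)
print(f"{results.attempted} examples attempted, {results.failed} failed")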
12 changes: 6 additions & 6 deletions clouddrift/adapters/gdp1h.py
@@ -610,11 +610,11 @@ def to_raggedarray(

# set dynamic global attributes
if ra.attrs_global:
ra.attrs_global["time_coverage_start"] = (
f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
)
ra.attrs_global["time_coverage_end"] = (
f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
)
ra.attrs_global[
"time_coverage_start"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
ra.attrs_global[
"time_coverage_end"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"

return ra
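
The hunk above only re-wraps these two assignments; the expression itself is unchanged and converts the minimum and maximum epoch seconds of the time coordinate into a timestamp string. A standalone sketch of that conversion, with an arbitrary example value:

from datetime import datetime, timedelta

# Epoch seconds -> formatted string, as in the attrs_global assignments above.
seconds = 1_600_000_000
stamp = f"{datetime(1970, 1, 1) + timedelta(seconds=seconds):%Y-%m-%d:%H:%M:%SZ}"
print(stamp)  # 2020-09-13:12:26:40Z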
146 changes: 98 additions & 48 deletions clouddrift/ragged.py
@@ -77,22 +77,29 @@ def apply_ragged(
multiple particles, the coordinates of which are found in the ragged arrays x, y, and t
that share row sizes 2, 3, and 4:

>>> from clouddrift.kinematics import velocity_from_position
>>> rowsize = [2, 3, 4]
>>> x = np.array([1, 2, 10, 12, 14, 30, 33, 36, 39])
>>> y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
>>> t = np.array([1, 2, 1, 2, 3, 1, 2, 3, 4])
>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], rowsize, coord_system="cartesian")
array([1., 1., 2., 2., 2., 3., 3., 3., 3.]),
array([1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> u1
array([1., 1., 2., 2., 2., 3., 3., 3., 3.])
>>> v1
array([1., 1., 1., 1., 1., 1., 1., 1., 1.])

To apply ``func`` to only a subset of rows, use the ``rows`` argument:

>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], rowsize, rows=0, coord_system="cartesian")
array([1., 1.]),
array([1., 1.]))
>>> u1
array([1., 1.])
>>> v1
array([1., 1.])
>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], rowsize, rows=[0, 1], coord_system="cartesian")
array([1., 1., 2., 2., 2.]),
array([1., 1., 1., 1., 1.]))
>>> u1
array([1., 1., 2., 2., 2.])
>>> v1
array([1., 1., 1., 1., 1.])

Raises
------
@@ -285,8 +292,10 @@ def prune(

Examples
--------
>>> from clouddrift.ragged import prune
>>> import numpy as np
>>> prune(np.array([1, 2, 3, 0, -1, -2]), np.array([3, 1, 2]),2)
(array([1, 2, 3, -1, -2]), array([3, 2]))
(array([ 1, 2, 3, -1, -2]), array([3, 2]))

Raises
------
@@ -357,9 +366,9 @@ def ragged_to_regular(
You can specify an alternative fill value:

>>> ragged_to_regular(np.array([1, 2, 3, 4, 5]), np.array([2, 1, 2]), fill_value=999)
array([[ 1., 2.],
[ 3., -999.],
[ 4., 5.]])
array([[ 1, 2],
[ 3, 999],
[ 4, 5]])

See Also
--------
@@ -401,7 +410,7 @@ def regular_to_ragged(
Alternatively, a different fill value can be specified:

>>> regular_to_ragged(np.array([[1, 2], [3, -999], [4, 5]]), fill_value=-999)
(array([1., 2., 3., 4., 5.]), array([2, 1, 2]))
(array([1, 2, 3, 4, 5]), array([2, 1, 2]))

See Also
--------
@@ -435,7 +444,7 @@ def rowsize_to_index(rowsize: list | np.ndarray | xr.DataArray) -> np.ndarray:
To obtain the indices within a ragged array of three consecutive rows of sizes 100, 202, and 53:

>>> rowsize_to_index([100, 202, 53])
array([0, 100, 302, 355])
array([ 0, 100, 302, 355])
"""
return np.cumsum(np.insert(np.array(rowsize), 0, 0))

@@ -468,6 +477,8 @@ def segment(
--------
The simplest use of ``segment`` is to provide a tolerance value that is
used to divide an array into segments:
>>> from clouddrift.ragged import segment, subset
>>> import numpy as np

>>> x = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4]
>>> segment(x, 0.5)
Expand Down Expand Up @@ -502,10 +513,10 @@ def segment(
If the input array contains time objects, the tolerance must be a time interval:

>>> x = np.array([np.datetime64("2023-01-01"), np.datetime64("2023-01-02"),
np.datetime64("2023-01-03"), np.datetime64("2023-02-01"),
np.datetime64("2023-02-02")])
... np.datetime64("2023-01-03"), np.datetime64("2023-02-01"),
... np.datetime64("2023-02-02")])
>>> segment(x, np.timedelta64(1, "D"))
np.array([3, 2])
array([3, 2])
"""

# for compatibility with datetime list or np.timedelta64 arrays
@@ -590,41 +601,67 @@ def subset(
a single drifter trajectory and the `row_dim_name` is "traj" and the `obs_dim_name` is "obs".

Retrieve a region, like the Gulf of Mexico, using ranges of latitude and longitude:
>>> from clouddrift.datasets import gdp6h
>>> from clouddrift.ragged import subset
>>> import numpy as np

>>> ds = gdp6h()
...

>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78)}, row_dim_name="traj")
<xarray.Dataset> ...
...

The parameter `full_rows` can be used to retrieve trajectories passing through a region, for example all trajectories passing through the Gulf of Mexico:

>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78)}, full_rows=True, row_dim_name="traj")
<xarray.Dataset> ...
...

Retrieve drogued trajectory segments:

>>> subset(ds, {"drogue_status": True}, row_dim_name="traj")
<xarray.Dataset> ...
Dimensions: (traj: ..., obs: ...)
Coordinates:
id (traj) int64 ...
time (obs) datetime64[ns] ...
...

Retrieve trajectory segments with temperature higher than 25°C (303.15K):

>>> subset(ds, {"sst": (303.15, np.inf)}, row_dim_name="traj")
>>> subset(ds, {"temp": (303.15, np.inf)}, row_dim_name="traj")
<xarray.Dataset> ...
...

You can use the same approach to return only the trajectories that are
shorter than some number of observations (similar to :func:`prune` but for
the entire dataset):

>>> subset(ds, {"rowsize": (0, 1000)}, row_dim_name="traj")
<xarray.Dataset> ...
...

Retrieve specific drifters using their IDs:

>>> subset(ds, {"id": [2578, 2582, 2583]}, row_dim_name="traj")
<xarray.Dataset> ...
...

Sometimes, you may want to retrieve specific rows of a ragged array.
You can do that by filtering along the trajectory dimension directly, since
it corresponds to row numbers:

>>> rows = [5, 6, 7]
>>> subset(ds, {"traj": rows}, row_dim_name="traj")
<xarray.Dataset> ...
...

Retrieve a specific time period:

>>> subset(ds, {"time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))}, row_dim_name="traj")
<xarray.Dataset> ...
...

Note that to subset the time variable, the range has to be defined using the
same type as the variable. By default, ``xarray`` uses ``np.datetime64`` to
@@ -633,27 +670,37 @@ def subset(

Those criteria can also be combined:

>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78), "drogue_status": True, "sst": (303.15, np.inf), "time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))}, row_dim_name="traj")
>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78), "drogue_status": True, "temp": (303.15, np.inf), "time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))}, row_dim_name="traj")
<xarray.Dataset> ...
...

You can also use a function to filter the data. For example, retrieve every other observation
of each trajectory:

>>> func = (lambda arr: ((arr - arr[0]) % 2) == 0)
>>> subset(ds, {"time": func}, row_dim_name="traj")
>>> subset(ds, {"id": func}, row_dim_name="traj")
<xarray.Dataset> ...
...

The filtering function can accept several input variables passed as a tuple. For example, retrieve
drifters released in the Mediterranean Sea, but exclude those released in the Bay of Biscay and the Black Sea:

>>> def mediterranean_mask(lon: xr.DataArray, lat: xr.DataArray) -> xr.DataArray:
>>> # Mediterranean Sea bounding box
>>> in_med = np.logical_and(-6.0327 <= lon, np.logical_and(lon <= 36.2173,
>>> np.logical_and(30.2639 <= lat, lat <= 45.7833)))
>>> # Bay of Biscay
>>> in_biscay = np.logical_and(lon <= -0.1462, lat >= 43.2744)
>>> # Black Sea
>>> in_blacksea = np.logical_and(lon >= 27.4437, lat >= 40.9088)
>>> return np.logical_and(in_med, np.logical_not(np.logical_or(in_biscay, in_blacksea)))
... # Mediterranean Sea bounding box
... in_med = np.logical_and(-6.0327 <= lon, np.logical_and(lon <= 36.2173,
... np.logical_and(30.2639 <= lat, lat <= 45.7833)))
... # Bay of Biscay
... in_biscay = np.logical_and(lon <= -0.1462, lat >= 43.2744)
... # Black Sea
... in_blacksea = np.logical_and(lon >= 27.4437, lat >= 40.9088)
... return np.logical_and(in_med, np.logical_not(np.logical_or(in_biscay, in_blacksea)))
>>> subset(ds, {("start_lon", "start_lat"): mediterranean_mask}, row_dim_name="traj")
<xarray.Dataset> Size: ...
Dimensions: (traj: ..., obs: ...)
Coordinates:
id (traj) int64 ...
time (obs) datetime64[ns] ...
...

Raises
------
@@ -773,25 +820,27 @@ def unpack(
--------

Unpacking longitude arrays from a ragged Xarray Dataset:
>>> from clouddrift.ragged import unpack
>>> from clouddrift.datasets import gdp6h

.. code-block:: python
>>> ds = gdp6h()

lon = unpack(ds.lon, ds["rowsize"]) # return a list[xr.DataArray] (slower)
lon = unpack(ds.lon.values, ds["rowsize"]) # return a list[np.ndarray] (faster)
first_lon = unpack(ds.lon.values, ds["rowsize"], rows=0) # return only the first row
first_two_lons = unpack(ds.lon.values, ds["rowsize"], rows=[0, 1]) # return first two rows
>>> lon = unpack(ds.lon, ds["rowsize"]) # return a list[xr.DataArray] (slower)
>>> lon = unpack(ds.lon.values, ds["rowsize"]) # return a list[np.ndarray] (faster)
>>> first_lon = unpack(ds.lon.values, ds["rowsize"], rows=0) # return only the first row
>>> first_two_lons = unpack(ds.lon.values, ds["rowsize"], rows=[0, 1]) # return first two rows

Looping over trajectories in a ragged Xarray Dataset to compute velocities
for each:

.. code-block:: python
>>> from clouddrift.kinematics import velocity_from_position

for lon, lat, time in list(zip(
unpack(ds.lon.values, ds["rowsize"]),
unpack(ds.lat.values, ds["rowsize"]),
unpack(ds.time.values, ds["rowsize"])
)):
u, v = velocity_from_position(lon, lat, time)
>>> for lon, lat, time in list(zip(
... unpack(ds.lon.values, ds["rowsize"]),
... unpack(ds.lat.values, ds["rowsize"]),
... unpack(ds.time.values, ds["rowsize"])
... )):
... u, v = velocity_from_position(lon, lat, time)
"""
indices = rowsize_to_index(rowsize)

@@ -830,33 +879,34 @@ def _mask_var(

Examples
--------
>>> import xarray as xr
>>> from clouddrift.ragged import _mask_var

>>> x = xr.DataArray(data=np.arange(0, 5))
>>> _mask_var(x, (2, 4))
<xarray.DataArray (dim_0: 5)>
<xarray.DataArray (dim_0: 5)> ...
array([False, False, True, True, True])
Dimensions without coordinates: dim_0

>>> _mask_var(x, [0, 2, 4])
<xarray.DataArray (dim_0: 5)>
array([ True, False, True, False, True])
Dimensions without coordinates: dim_0
array([ True, False, True, False, True])

>>> _mask_var(x, 4)
<xarray.DataArray (dim_0: 5)>
array([False, False, False, True, False])
<xarray.DataArray (dim_0: 5)> ...
array([False, False, False, False, True])
Dimensions without coordinates: dim_0

>>> rowsize = xr.DataArray(data=[2, 3])
>>> _mask_var(x, lambda arr: arr==arr[0]+1, rowsize, "dim_0")
<xarray.DataArray (dim_0: 5)>
array([False, True, False, True, False])
<xarray.DataArray (dim_0: 5)> ...
array([False, True, False, True, False])
Dimensions without coordinates: dim_0

>>> y = xr.DataArray(data=np.arange(0, 5)+2)
>>> rowsize = xr.DataArray(data=[2, 3])
>>> _mask_var([x, y], lambda var1, var2: ((var1 * var2) % 2) == 0, rowsize, "dim_0")
<xarray.DataArray (dim_0: 5)>
array([True, False, True, False, True])
<xarray.DataArray (dim_0: 5)> ...
array([ True, False, True, False, True])
Dimensions without coordinates: dim_0

Returns
13 changes: 13 additions & 0 deletions tests/docexamples.py
@@ -0,0 +1,13 @@
import doctest

import clouddrift.ragged as ragged


def load_tests(loader, tests, ignore):
tests.addTests(
doctest.DocTestSuite(
ragged,
optionflags=doctest.ELLIPSIS | doctest.FAIL_FAST,
)
)
return tests
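
The load_tests hook above is how unittest discovery picks up the doctests, and the option flags control how they run: ELLIPSIS is what lets expected outputs such as <xarray.Dataset> ... match, and FAIL_FAST stops at the first failure. A minimal self-contained illustration of those flags (the function below is hypothetical and not part of clouddrift):

import doctest
from datetime import datetime


def now_iso():
    """Return the current time as an ISO 8601 string.

    >>> now_iso()
    '20...'
    """
    return datetime.now().isoformat()


# ELLIPSIS lets the "..." in the expected output above match the varying
# timestamp; FAIL_FAST would stop at the first failing example.
print(doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.FAIL_FAST))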