🔧 Integrate doctest to run on our ragged.py module #381

Merged
merged 13 commits on Apr 11, 2024
21 changes: 19 additions & 2 deletions .github/workflows/ci.yml
@@ -13,7 +13,7 @@ on:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
test:
unittest:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@@ -33,7 +33,7 @@ jobs:
cartopy
- name: Run unit tests
shell: bash -l {0}
run: coverage run -m unittest discover -s tests -p "*.py"
run: coverage run -m unittest discover -s tests -p "*_tests.py"
- name: Create coverage report
shell: bash -l {0}
run: |
@@ -43,3 +43,20 @@ jobs:
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
doctest:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest"]
python-version: ["3.10"]
steps:
- uses: actions/checkout@v4
- uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
environment-name: clouddrift
- name: Run doc tests
shell: bash -l {0}
run: |
python -m unittest tests/docexamples.py
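
The new job runs every docstring example in ragged.py through unittest via tests/docexamples.py. For a quick local check, a minimal sketch (assuming clouddrift is importable in the active environment; this snippet is not part of the diff) is to hand the module to doctest directly:

import doctest

import clouddrift.ragged as ragged

# Run every docstring example in clouddrift.ragged; ELLIPSIS lets expected
# outputs such as "<xarray.Dataset> ..." match variable output.
results = doctest.testmod(ragged, optionflags=doctest.ELLIPSIS)
print(f"{results.attempted} examples attempted, {results.failed} failed")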
12 changes: 6 additions & 6 deletions clouddrift/adapters/gdp1h.py
@@ -610,11 +610,11 @@ def to_raggedarray(

# set dynamic global attributes
if ra.attrs_global:
ra.attrs_global["time_coverage_start"] = (
f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
)
ra.attrs_global["time_coverage_end"] = (
f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
)
ra.attrs_global[
"time_coverage_start"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
ra.attrs_global[
"time_coverage_end"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"

return ra
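
The hunk above only re-wraps these two assignments; the expression itself is unchanged and converts the minimum and maximum epoch seconds of the time coordinate into a timestamp string. A standalone sketch of that conversion, with an arbitrary example value:

from datetime import datetime, timedelta

# Epoch seconds -> formatted string, as in the attrs_global assignments above.
seconds = 1_600_000_000
stamp = f"{datetime(1970, 1, 1) + timedelta(seconds=seconds):%Y-%m-%d:%H:%M:%SZ}"
print(stamp)  # 2020-09-13:12:26:40Z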
146 changes: 98 additions & 48 deletions clouddrift/ragged.py
@@ -77,22 +77,29 @@ def apply_ragged(
multiple particles, the coordinates of which are found in the ragged arrays x, y, and t
that share row sizes 2, 3, and 4:

>>> from clouddrift.kinematics import velocity_from_position
>>> rowsize = [2, 3, 4]
>>> x = np.array([1, 2, 10, 12, 14, 30, 33, 36, 39])
>>> y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
>>> t = np.array([1, 2, 1, 2, 3, 1, 2, 3, 4])
>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], rowsize, coord_system="cartesian")
array([1., 1., 2., 2., 2., 3., 3., 3., 3.]),
array([1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> u1
array([1., 1., 2., 2., 2., 3., 3., 3., 3.])
>>> v1
array([1., 1., 1., 1., 1., 1., 1., 1., 1.])

To apply ``func`` to only a subset of rows, use the ``rows`` argument:

>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], rowsize, rows=0, coord_system="cartesian")
array([1., 1.]),
array([1., 1.]))
>>> u1
array([1., 1.])
>>> v1
array([1., 1.])
>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], rowsize, rows=[0, 1], coord_system="cartesian")
array([1., 1., 2., 2., 2.]),
array([1., 1., 1., 1., 1.]))
>>> u1
array([1., 1., 2., 2., 2.])
>>> v1
array([1., 1., 1., 1., 1.])

Raises
------
@@ -285,8 +292,10 @@ def prune(

Examples
--------
>>> from clouddrift.ragged import prune
>>> import numpy as np
>>> prune(np.array([1, 2, 3, 0, -1, -2]), np.array([3, 1, 2]),2)
(array([1, 2, 3, -1, -2]), array([3, 2]))
(array([ 1, 2, 3, -1, -2]), array([3, 2]))

Raises
------
@@ -357,9 +366,9 @@ def ragged_to_regular(
You can specify an alternative fill value:

>>> ragged_to_regular(np.array([1, 2, 3, 4, 5]), np.array([2, 1, 2]), fill_value=999)
array([[ 1., 2.],
[ 3., -999.],
[ 4., 5.]])
array([[ 1, 2],
[ 3, 999],
[ 4, 5]])

See Also
--------
@@ -401,7 +410,7 @@ def regular_to_ragged(
Alternatively, a different fill value can be specified:

>>> regular_to_ragged(np.array([[1, 2], [3, -999], [4, 5]]), fill_value=-999)
(array([1., 2., 3., 4., 5.]), array([2, 1, 2]))
(array([1, 2, 3, 4, 5]), array([2, 1, 2]))

See Also
--------
@@ -435,7 +444,7 @@ def rowsize_to_index(rowsize: list | np.ndarray | xr.DataArray) -> np.ndarray:
To obtain the indices within a ragged array of three consecutive rows of sizes 100, 202, and 53:

>>> rowsize_to_index([100, 202, 53])
array([0, 100, 302, 355])
array([ 0, 100, 302, 355])
"""
return np.cumsum(np.insert(np.array(rowsize), 0, 0))

@@ -468,6 +477,8 @@ def segment(
--------
The simplest use of ``segment`` is to provide a tolerance value that is
used to divide an array into segments:
>>> from clouddrift.ragged import segment, subset
>>> import numpy as np

>>> x = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4]
>>> segment(x, 0.5)
Expand Down Expand Up @@ -502,10 +513,10 @@ def segment(
If the input array contains time objects, the tolerance must be a time interval:

>>> x = np.array([np.datetime64("2023-01-01"), np.datetime64("2023-01-02"),
np.datetime64("2023-01-03"), np.datetime64("2023-02-01"),
np.datetime64("2023-02-02")])
... np.datetime64("2023-01-03"), np.datetime64("2023-02-01"),
... np.datetime64("2023-02-02")])
>>> segment(x, np.timedelta64(1, "D"))
np.array([3, 2])
array([3, 2])
"""

# for compatibility with datetime list or np.timedelta64 arrays
@@ -590,41 +601,67 @@ def subset(
a single drifter trajectory and the `row_dim_name` is "traj" and the `obs_dim_name` is "obs".

Retrieve a region, like the Gulf of Mexico, using ranges of latitude and longitude:
>>> from clouddrift.datasets import gdp6h
>>> from clouddrift.ragged import subset
>>> import numpy as np

>>> ds = gdp6h()
...

>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78)}, row_dim_name="traj")
<xarray.Dataset> ...
...

The parameter `full_rows` can be used to retrieve trajectories passing through a region, for example all trajectories passing through the Gulf of Mexico:

>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78)}, full_rows=True, row_dim_name="traj")
<xarray.Dataset> ...
...

Retrieve drogued trajectory segments:

>>> subset(ds, {"drogue_status": True}, row_dim_name="traj")
<xarray.Dataset> ...
Dimensions: (traj: ..., obs: ...)
Coordinates:
id (traj) int64 ...
time (obs) datetime64[ns] ...
...

Retrieve trajectory segments with temperature higher than 25°C (303.15K):

>>> subset(ds, {"sst": (303.15, np.inf)}, row_dim_name="traj")
>>> subset(ds, {"temp": (303.15, np.inf)}, row_dim_name="traj")
<xarray.Dataset> ...
...

You can use the same approach to return only the trajectories that are
shorter than some number of observations (similar to :func:`prune` but for
the entire dataset):

>>> subset(ds, {"rowsize": (0, 1000)}, row_dim_name="traj")
<xarray.Dataset> ...
...

Retrieve specific drifters using their IDs:

>>> subset(ds, {"id": [2578, 2582, 2583]}, row_dim_name="traj")
<xarray.Dataset> ...
...

Sometimes, you may want to retrieve specific rows of a ragged array.
You can do that by filtering along the trajectory dimension directly, since
it corresponds to row numbers:

>>> rows = [5, 6, 7]
>>> subset(ds, {"traj": rows}, row_dim_name="traj")
<xarray.Dataset> ...
...

Retrieve a specific time period:

>>> subset(ds, {"time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))}, row_dim_name="traj")
<xarray.Dataset> ...
...

Note that to subset the time variable, the range has to be defined using the
same type as the variable. By default, ``xarray`` uses ``np.datetime64`` to
@@ -633,27 +670,37 @@ def subset(

Those criteria can also be combined:

>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78), "drogue_status": True, "sst": (303.15, np.inf), "time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))}, row_dim_name="traj")
>>> subset(ds, {"lat": (21, 31), "lon": (-98, -78), "drogue_status": True, "temp": (303.15, np.inf), "time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))}, row_dim_name="traj")
<xarray.Dataset> ...
...

You can also use a function to filter the data. For example, retrieve every other observation
of each trajectory:

>>> func = (lambda arr: ((arr - arr[0]) % 2) == 0)
>>> subset(ds, {"time": func}, row_dim_name="traj")
>>> subset(ds, {"id": func}, row_dim_name="traj")
<xarray.Dataset> ...
...

The filtering function can accept several input variables passed as a tuple. For example, retrieve
drifters released in the Mediterranean Sea, but exclude those released in the Bay of Biscay and the Black Sea:

>>> def mediterranean_mask(lon: xr.DataArray, lat: xr.DataArray) -> xr.DataArray:
>>> # Mediterranean Sea bounding box
>>> in_med = np.logical_and(-6.0327 <= lon, np.logical_and(lon <= 36.2173,
>>> np.logical_and(30.2639 <= lat, lat <= 45.7833)))
>>> # Bay of Biscay
>>> in_biscay = np.logical_and(lon <= -0.1462, lat >= 43.2744)
>>> # Black Sea
>>> in_blacksea = np.logical_and(lon >= 27.4437, lat >= 40.9088)
>>> return np.logical_and(in_med, np.logical_not(np.logical_or(in_biscay, in_blacksea)))
... # Mediterranean Sea bounding box
... in_med = np.logical_and(-6.0327 <= lon, np.logical_and(lon <= 36.2173,
... np.logical_and(30.2639 <= lat, lat <= 45.7833)))
... # Bay of Biscay
... in_biscay = np.logical_and(lon <= -0.1462, lat >= 43.2744)
... # Black Sea
... in_blacksea = np.logical_and(lon >= 27.4437, lat >= 40.9088)
... return np.logical_and(in_med, np.logical_not(np.logical_or(in_biscay, in_blacksea)))
>>> subset(ds, {("start_lon", "start_lat"): mediterranean_mask}, row_dim_name="traj")
<xarray.Dataset> Size: ...
Dimensions: (traj: ..., obs: ...)
Coordinates:
id (traj) int64 ...
time (obs) datetime64[ns] ...
...

Raises
------
@@ -773,25 +820,27 @@ def unpack(
--------

Unpacking longitude arrays from a ragged Xarray Dataset:
>>> from clouddrift.ragged import unpack
>>> from clouddrift.datasets import gdp6h

.. code-block:: python
>>> ds = gdp6h()

lon = unpack(ds.lon, ds["rowsize"]) # return a list[xr.DataArray] (slower)
lon = unpack(ds.lon.values, ds["rowsize"]) # return a list[np.ndarray] (faster)
first_lon = unpack(ds.lon.values, ds["rowsize"], rows=0) # return only the first row
first_two_lons = unpack(ds.lon.values, ds["rowsize"], rows=[0, 1]) # return first two rows
>>> lon = unpack(ds.lon, ds["rowsize"]) # return a list[xr.DataArray] (slower)
>>> lon = unpack(ds.lon.values, ds["rowsize"]) # return a list[np.ndarray] (faster)
>>> first_lon = unpack(ds.lon.values, ds["rowsize"], rows=0) # return only the first row
>>> first_two_lons = unpack(ds.lon.values, ds["rowsize"], rows=[0, 1]) # return first two rows

Looping over trajectories in a ragged Xarray Dataset to compute velocities
for each:

.. code-block:: python
>>> from clouddrift.kinematics import velocity_from_position

for lon, lat, time in list(zip(
unpack(ds.lon.values, ds["rowsize"]),
unpack(ds.lat.values, ds["rowsize"]),
unpack(ds.time.values, ds["rowsize"])
)):
u, v = velocity_from_position(lon, lat, time)
>>> for lon, lat, time in list(zip(
... unpack(ds.lon.values, ds["rowsize"]),
... unpack(ds.lat.values, ds["rowsize"]),
... unpack(ds.time.values, ds["rowsize"])
... )):
... u, v = velocity_from_position(lon, lat, time)
"""
indices = rowsize_to_index(rowsize)

@@ -830,33 +879,34 @@ def _mask_var(

Examples
--------
>>> import xarray as xr
>>> from clouddrift.ragged import _mask_var

>>> x = xr.DataArray(data=np.arange(0, 5))
>>> _mask_var(x, (2, 4))
<xarray.DataArray (dim_0: 5)>
<xarray.DataArray (dim_0: 5)> ...
array([False, False, True, True, True])
Dimensions without coordinates: dim_0

>>> _mask_var(x, [0, 2, 4])
<xarray.DataArray (dim_0: 5)>
array([ True, False, True, False, True])
Dimensions without coordinates: dim_0
array([ True, False, True, False, True])

>>> _mask_var(x, 4)
<xarray.DataArray (dim_0: 5)>
array([False, False, False, True, False])
<xarray.DataArray (dim_0: 5)> ...
array([False, False, False, False, True])
Dimensions without coordinates: dim_0

>>> rowsize = xr.DataArray(data=[2, 3])
>>> _mask_var(x, lambda arr: arr==arr[0]+1, rowsize, "dim_0")
<xarray.DataArray (dim_0: 5)>
array([False, True, False, True, False])
<xarray.DataArray (dim_0: 5)> ...
array([False, True, False, True, False])
Dimensions without coordinates: dim_0

>>> y = xr.DataArray(data=np.arange(0, 5)+2)
>>> rowsize = xr.DataArray(data=[2, 3])
>>> _mask_var([x, y], lambda var1, var2: ((var1 * var2) % 2) == 0, rowsize, "dim_0")
<xarray.DataArray (dim_0: 5)>
array([True, False, True, False, True])
<xarray.DataArray (dim_0: 5)> ...
array([ True, False, True, False, True])
Dimensions without coordinates: dim_0

Returns
13 changes: 13 additions & 0 deletions tests/docexamples.py
@@ -0,0 +1,13 @@
import doctest

import clouddrift.ragged as ragged


def load_tests(loader, tests, ignore):
tests.addTests(
doctest.DocTestSuite(
ragged,
optionflags=doctest.ELLIPSIS | doctest.FAIL_FAST,
)
)
return tests
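
The load_tests hook above is how unittest discovery picks up the doctests, and the option flags control how they run: ELLIPSIS is what lets expected outputs such as <xarray.Dataset> ... match, and FAIL_FAST stops at the first failure. A minimal self-contained illustration of those flags (the function below is hypothetical and not part of clouddrift):

import doctest
from datetime import datetime


def now_iso():
    """Return the current time as an ISO 8601 string.

    >>> now_iso()
    '20...'
    """
    return datetime.now().isoformat()


# ELLIPSIS lets the "..." in the expected output above match the varying
# timestamp; FAIL_FAST would stop at the first failing example.
print(doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.FAIL_FAST))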