Skip to content

Commit

Permalink
Datasets consistency (#336)
Browse files Browse the repository at this point in the history
* Cast time to float64 in datasets

* Only convert if we're not in floats already

* Adjust pairs tests now that the GLAD time is in seconds

* Bump patch version

* Consistent id(traj) coordinate in gdp1h and gdp6h

* Default ID->id in ragged.subset

* Allow decode_times

* Fix tests

* Fix pairs tests back to
  • Loading branch information
milancurcic authored Dec 18, 2023
1 parent 2e47a3d commit 6f7d23e
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 77 deletions.
129 changes: 94 additions & 35 deletions clouddrift/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,28 @@
This module provides functions to easily access ragged array datasets. If the datasets are
not accessed via cloud storage platforms or are not found on the local filesystem,
they will be downloaded from their upstream repositories and stored for later access
(~/.clouddrift for unix-based systems).
(~/.clouddrift for UNIX-based systems).
"""

from clouddrift import adapters
import os
import xarray as xr


def gdp1h() -> xr.Dataset:
def gdp1h(decode_times: bool = True) -> xr.Dataset:
"""Returns the latest version of the NOAA Global Drifter Program (GDP) hourly
dataset as a ragged array Xarray dataset.
The data is accessed from a zarr archive hosted on a public AWS S3 bucket accessible at
https://registry.opendata.aws/noaa-oar-hourly-gdp/. The original data source from NOAA NCEI
is https://doi.org/10.25921/x46c-3620.
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
xarray.Dataset
Expand All @@ -31,7 +37,7 @@ def gdp1h() -> xr.Dataset:
<xarray.Dataset>
Dimensions: (traj: 19396, obs: 197214787)
Coordinates:
ids (obs) int64 ...
id (traj) int64 ...
time (obs) datetime64[ns] ...
Dimensions without coordinates: traj, obs
Data variables: (12/60)
Expand Down Expand Up @@ -68,17 +74,26 @@ def gdp1h() -> xr.Dataset:
:func:`gdp6h`
"""
url = "https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/latest/gdp-v2.01.zarr"
return xr.open_dataset(url, engine="zarr")
ds = xr.open_dataset(url, engine="zarr", decode_times=decode_times)
ds = ds.rename_vars({"ID": "id"}).assign_coords({"id": ds.ID}).drop_vars(["ids"])
return ds


def gdp6h() -> xr.Dataset:
def gdp6h(decode_times: bool = True) -> xr.Dataset:
"""Returns the NOAA Global Drifter Program (GDP) 6-hourly dataset as a ragged array
Xarray dataset.
The data is accessed from a public HTTPS server at NOAA's Atlantic
Oceanographic and Meteorological Laboratory (AOML) accessible at
https://www.aoml.noaa.gov/phod/gdp/index.php.
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
xarray.Dataset
Expand All @@ -92,13 +107,12 @@ def gdp6h() -> xr.Dataset:
<xarray.Dataset>
Dimensions: (traj: 26843, obs: 44544647)
Coordinates:
ids (obs) int64 ...
id (traj) int64 ...
time (obs) datetime64[ns] ...
lon (obs) float32 ...
lat (obs) float32 ...
Dimensions without coordinates: traj, obs
Data variables: (12/44)
ID (traj) int64 ...
rowsize (traj) int32 ...
WMO (traj) int32 ...
expno (traj) int32 ...
Expand Down Expand Up @@ -131,19 +145,28 @@ def gdp6h() -> xr.Dataset:
:func:`gdp1h`
"""
url = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/gdp_jul22_ragged_6h.nc#mode=bytes"
return xr.open_dataset(url)
ds = xr.open_dataset(url, decode_times=decode_times)
ds = ds.rename_vars({"ID": "id"}).assign_coords({"id": ds.ID}).drop_vars(["ids"])
return ds


def glad() -> xr.Dataset:
def glad(decode_times: bool = True) -> xr.Dataset:
"""Returns the Grand LAgrangian Deployment (GLAD) dataset as a ragged array
Xarray dataset.
Xarray dataset.
The function will first look for the ragged-array dataset on the local
filesystem. If it is not found, the dataset will be downloaded using the
corresponding adapter function and stored for later access.
The upstream data is available at https://doi.org/10.7266/N7VD6WC8.
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
xarray.Dataset
Expand All @@ -157,8 +180,8 @@ def glad() -> xr.Dataset:
<xarray.Dataset>
Dimensions: (obs: 1602883, traj: 297)
Coordinates:
* time (obs) datetime64[ns] 2012-07-20T01:15:00.143960 ... 2012-...
* id (traj) object 'CARTHE_001' 'CARTHE_002' ... 'CARTHE_451'
time (obs) datetime64[ns] ...
id (traj) object ...
Data variables:
latitude (obs) float32 ...
longitude (obs) float32 ...
Expand Down Expand Up @@ -190,11 +213,11 @@ def glad() -> xr.Dataset:
os.makedirs(os.path.dirname(glad_path), exist_ok=True)
ds.to_netcdf(glad_path)
else:
ds = xr.open_dataset(glad_path)
ds = xr.open_dataset(glad_path, decode_times=decode_times)
return ds


def mosaic() -> xr.Dataset:
def mosaic(decode_times: bool = True) -> xr.Dataset:
"""Returns the MOSAiC sea-ice drift dataset as a ragged array Xarray dataset.
The function will first look for the ragged-array dataset on the local
Expand All @@ -214,6 +237,13 @@ def mosaic() -> xr.Dataset:
for the Study of Arctic Climate (MOSAiC) expedition 2019 - 2021. Arctic Data Center.
doi:10.18739/A2KP7TS83.
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
xarray.Dataset
Expand Down Expand Up @@ -257,16 +287,23 @@ def mosaic() -> xr.Dataset:
os.makedirs(os.path.dirname(mosaic_path), exist_ok=True)
ds.to_netcdf(mosaic_path)
else:
ds = xr.open_dataset(mosaic_path)
ds = xr.open_dataset(mosaic_path, decode_times=decode_times)
return ds


def spotters() -> xr.Dataset:
"""Returns the SOFAR ocean drifters ragged array dataset as an Xarray dataset.
def spotters(decode_times: bool = True) -> xr.Dataset:
"""Returns the Sofar Ocean Spotter drifters ragged array dataset as an Xarray dataset.
The data is accessed from a zarr archive hosted on a public AWS S3 bucket accessible
at https://sofar-spotter-archive.s3.amazonaws.com/spotter_data_bulk_zarr.
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
xarray.Dataset
Expand Down Expand Up @@ -304,10 +341,10 @@ def spotters() -> xr.Dataset:
title: Sofar Spotter Data Archive - Bulk Wave Parameters
"""
url = "https://sofar-spotter-archive.s3.amazonaws.com/spotter_data_bulk_zarr"
return xr.open_dataset(url, engine="zarr")
return xr.open_dataset(url, engine="zarr", decode_times=decode_times)


def subsurface_floats() -> xr.Dataset:
def subsurface_floats(decode_times: bool = True) -> xr.Dataset:
"""Returns the subsurface floats dataset as a ragged array Xarray dataset.
The data is accessed from a public HTTPS server at NOAA's Atlantic
Expand Down Expand Up @@ -335,6 +372,13 @@ def subsurface_floats() -> xr.Dataset:
compiled in a single Matlab data set. See here for more information on the variables
contained in these files.
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
xarray.Dataset
Expand Down Expand Up @@ -390,23 +434,24 @@ def subsurface_floats() -> xr.Dataset:
print(f"{local_file} not found; download from upstream repository.")
ds = adapters.subsurface_floats.to_xarray()
else:
ds = xr.open_dataset(local_file)
ds = xr.open_dataset(local_file, decode_times=decode_times)
return ds


def yomaha() -> xr.Dataset:
def yomaha(decode_times: bool = True) -> xr.Dataset:
"""Returns the YoMaHa dataset as a ragged array Xarray dataset.
The function will first look for the ragged-array dataset on the local
filesystem. If it is not found, the dataset will be downloaded using the
corresponding adapter function and stored for later access. The upstream
data is available at http://apdrc.soest.hawaii.edu/projects/yomaha/.
Reference
---------
Lebedev, K. V., Yoshinari, H., Maximenko, N. A., & Hacker, P. W. (2007). Velocity data
assessed from trajectories of Argo floats at parking level and at the sea
surface. IPRC Technical Note, 4(2), 1-16.
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
Expand Down Expand Up @@ -449,6 +494,12 @@ def yomaha() -> xr.Dataset:
publisher_name: Asia-Pacific Data Research Center
publisher_url: http://apdrc.soest.hawaii.edu/index.php
license: Creative Commons Attribution 4.0 International License..
Reference
---------
Lebedev, K. V., Yoshinari, H., Maximenko, N. A., & Hacker, P. W. (2007). Velocity data
assessed from trajectories of Argo floats at parking level and at the sea
surface. IPRC Technical Note, 4(2), 1-16.
"""
clouddrift_path = (
os.path.expanduser("~/.clouddrift")
Expand All @@ -462,23 +513,25 @@ def yomaha() -> xr.Dataset:
os.makedirs(os.path.dirname(local_file), exist_ok=True)
ds.to_netcdf(local_file)
else:
ds = xr.open_dataset(local_file)
ds = xr.open_dataset(local_file, decode_times=decode_times)
return ds


def andro() -> xr.Dataset:
def andro(decode_times: bool = True) -> xr.Dataset:
"""Returns the ANDRO dataset as a ragged array Xarray dataset.
The function will first look for the ragged-array dataset on the local
filesystem. If it is not found, the dataset will be downloaded using the
corresponding adapter function and stored for later access. The upstream
data is available at https://www.seanoe.org/data/00360/47077/.
Reference
---------
Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
SEANOE. https://doi.org/10.17882/47077
Parameters
----------
decode_times : bool, optional
If True, decode the time coordinate into a datetime object. If False, the time
coordinate will be an int64 or float64 array of increments since the origin
time indicated in the units attribute. Default is True.
Returns
-------
xarray.Dataset
Expand Down Expand Up @@ -518,6 +571,12 @@ def andro() -> xr.Dataset:
publisher_name: SEANOE (SEA scieNtific Open data Edition)
publisher_url: https://www.seanoe.org/data/00360/47077/
license: freely available
Reference
---------
Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
SEANOE. https://doi.org/10.17882/47077
"""
clouddrift_path = (
os.path.expanduser("~/.clouddrift")
Expand All @@ -531,5 +590,5 @@ def andro() -> xr.Dataset:
os.makedirs(os.path.dirname(local_file), exist_ok=True)
ds.to_netcdf(local_file)
else:
ds = xr.open_dataset(local_file)
ds = xr.open_dataset(local_file, decode_times=decode_times)
return ds
6 changes: 3 additions & 3 deletions clouddrift/ragged.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ def segment(
def subset(
ds: xr.Dataset,
criteria: dict,
id_var_name: str = "ID",
id_var_name: str = "id",
rowsize_var_name: str = "rowsize",
traj_dim_name: str = "traj",
obs_dim_name: str = "obs",
Expand All @@ -561,7 +561,7 @@ def subset(
criteria : dict
dictionary containing the variables and the ranges/values to subset
id_var_name : str, optional
Name of the variable containing the ID of the trajectories (default is "ID")
Name of the variable containing the ID of the trajectories (default is "id")
rowsize_var_name : str, optional
Name of the variable containing the number of observations per trajectory (default is "rowsize")
traj_dim_name : str, optional
Expand Down Expand Up @@ -607,7 +607,7 @@ def subset(
Retrieve specific drifters from their IDs:
>>> subset(ds, {"ID": [2578, 2582, 2583]})
>>> subset(ds, {"id": [2578, 2582, 2583]})
Sometimes, you may want to retrieve specific rows of a ragged array.
You can do that by filtering along the trajectory dimension directly, since
Expand Down
6 changes: 3 additions & 3 deletions docs/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ Currently available datasets are:
hosted by NOAA AOML at
`NOAA's Atlantic Oceanographic and Meteorological Laboratory (AOML) <https://www.aoml.noaa.gov/phod/float_traj/index.php>`_
and maintained by Andree Ramsey and Heather Furey from the Woods Hole Oceanographic Institution.
- :func:`clouddrift.datasets.spotters`: The SOFAR ocean spotters archive dataset as hosted at the public `AWS S3 bucket <https://sofar-spotter-archive.s3.amazonaws.com/spotter_data_bulk_zarr>`_.
- :func:`clouddrift.datasets.spotters`: The Sofar Ocean Spotters archive dataset as hosted at the public `AWS S3 bucket <https://sofar-spotter-archive.s3.amazonaws.com/spotter_data_bulk_zarr>`_.
- :func:`clouddrift.datasets.yomaha`: The YoMaHa'07 dataset as a ragged array
processed from the upstream dataset hosted at the `Asia-Pacific Data-Research
Center (APDRC) <http://apdrc.soest.hawaii.edu/projects/yomaha/>`_.

The GDP nd the spotters datasets are accessed lazily, so the data is only downloaded when
specific array values are referenced. The ANDRO, GLAD, MOSAiC, Subsurface floats, and YoMaHa'07
The GDP and the Spotters datasets are accessed lazily, so the data is only downloaded when
specific array values are referenced. The ANDRO, GLAD, MOSAiC, Subsurface Floats, and YoMaHa'07
datasets are downloaded in their entirety when the function is called for the first
time and stored locally for later use.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "clouddrift"
version = "0.30.0"
version = "0.29.0"
authors = [
{ name="Shane Elipot", email="selipot@miami.edu" },
{ name="Philippe Miron", email="philippemiron@gmail.com" },
Expand Down
6 changes: 3 additions & 3 deletions tests/datasets_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@


class datasets_tests(unittest.TestCase):
def test_gdp1h_opens(self):
def test_gdp1h(self):
ds = datasets.gdp1h()
self.assertTrue(ds)

def test_gdp6h_opens(self):
def test_gdp6h(self):
ds = datasets.gdp6h()
self.assertTrue(ds)

def test_glad_opens(self):
def test_glad(self):
ds = datasets.glad()
self.assertTrue(ds)

Expand Down
Loading

0 comments on commit 6f7d23e

Please sign in to comment.