diff --git a/clouddrift/datasets.py b/clouddrift/datasets.py index 8b4c4017..8f41355c 100644 --- a/clouddrift/datasets.py +++ b/clouddrift/datasets.py @@ -2,15 +2,14 @@ This module provides functions to easily access ragged array datasets. If the datasets are not accessed via cloud storage platforms or are not found on the local filesystem, they will be downloaded from their upstream repositories and stored for later access -(~/.clouddrift for unix-based systems). +(~/.clouddrift for UNIX-based systems). """ - from clouddrift import adapters import os import xarray as xr -def gdp1h() -> xr.Dataset: +def gdp1h(decode_times: bool = True) -> xr.Dataset: """Returns the latest version of the NOAA Global Drifter Program (GDP) hourly dataset as a ragged array Xarray dataset. @@ -18,6 +17,13 @@ def gdp1h() -> xr.Dataset: https://registry.opendata.aws/noaa-oar-hourly-gdp/. Original data source from NOAA NCEI is https://doi.org/10.25921/x46c-3620). + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. Default is True. + Returns ------- xarray.Dataset @@ -31,7 +37,7 @@ def gdp1h() -> xr.Dataset: Dimensions: (traj: 19396, obs: 197214787) Coordinates: - ids (obs) int64 ... + id (traj) int64 ... time (obs) datetime64[ns] ... 
Dimensions without coordinates: traj, obs Data variables: (12/60) @@ -68,10 +74,12 @@ def gdp1h() -> xr.Dataset: :func:`gdp6h` """ url = "https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/latest/gdp-v2.01.zarr" - return xr.open_dataset(url, engine="zarr") + ds = xr.open_dataset(url, engine="zarr", decode_times=decode_times) + ds = ds.rename_vars({"ID": "id"}).assign_coords({"id": ds.ID}).drop_vars(["ids"]) + return ds -def gdp6h() -> xr.Dataset: +def gdp6h(decode_times: bool = True) -> xr.Dataset: """Returns the NOAA Global Drifter Program (GDP) 6-hourly dataset as a ragged array Xarray dataset. @@ -79,6 +87,13 @@ def gdp6h() -> xr.Dataset: Oceanographic and Meteorological Laboratory (AOML) accessible at https://www.aoml.noaa.gov/phod/gdp/index.php. + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. Default is True. + Returns ------- xarray.Dataset @@ -92,13 +107,12 @@ def gdp6h() -> xr.Dataset: Dimensions: (traj: 26843, obs: 44544647) Coordinates: - ids (obs) int64 ... + id (traj) int64 ... time (obs) datetime64[ns] ... lon (obs) float32 ... lat (obs) float32 ... Dimensions without coordinates: traj, obs Data variables: (12/44) - ID (traj) int64 ... rowsize (traj) int32 ... WMO (traj) int32 ... expno (traj) int32 ... @@ -131,12 +145,14 @@ def gdp6h() -> xr.Dataset: :func:`gdp1h` """ url = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/gdp_jul22_ragged_6h.nc#mode=bytes" - return xr.open_dataset(url) + ds = xr.open_dataset(url, decode_times=decode_times) + ds = ds.rename_vars({"ID": "id"}).assign_coords({"id": ds.ID}).drop_vars(["ids"]) + return ds -def glad() -> xr.Dataset: +def glad(decode_times: bool = True) -> xr.Dataset: """Returns the Grand LAgrangian Deployment (GLAD) dataset as a ragged array - Xarray dataset. + Xarray dataset. 
The function will first look for the ragged-array dataset on the local filesystem. If it is not found, the dataset will be downloaded using the @@ -144,6 +160,13 @@ def glad() -> xr.Dataset: The upstream data is available at https://doi.org/10.7266/N7VD6WC8. + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. Default is True. + Returns ------- xarray.Dataset @@ -157,8 +180,8 @@ def glad() -> xr.Dataset: Dimensions: (obs: 1602883, traj: 297) Coordinates: - * time (obs) datetime64[ns] 2012-07-20T01:15:00.143960 ... 2012-... - * id (traj) object 'CARTHE_001' 'CARTHE_002' ... 'CARTHE_451' + time (obs) datetime64[ns] ... + id (traj) object ... Data variables: latitude (obs) float32 ... longitude (obs) float32 ... @@ -190,11 +213,11 @@ def glad() -> xr.Dataset: os.makedirs(os.path.dirname(glad_path), exist_ok=True) ds.to_netcdf(glad_path) else: - ds = xr.open_dataset(glad_path) + ds = xr.open_dataset(glad_path, decode_times=decode_times) return ds -def mosaic() -> xr.Dataset: +def mosaic(decode_times: bool = True) -> xr.Dataset: """Returns the MOSAiC sea-ice drift dataset as a ragged array Xarray dataset. The function will first look for the ragged-array dataset on the local @@ -214,6 +237,13 @@ def mosaic() -> xr.Dataset: for the Study of Arctic Climate (MOSAiC) expedition 2019 - 2021. Arctic Data Center. doi:10.18739/A2KP7TS83. + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. Default is True. 
+ Returns ------- xarray.Dataset @@ -257,16 +287,23 @@ def mosaic() -> xr.Dataset: os.makedirs(os.path.dirname(mosaic_path), exist_ok=True) ds.to_netcdf(mosaic_path) else: - ds = xr.open_dataset(mosaic_path) + ds = xr.open_dataset(mosaic_path, decode_times=decode_times) return ds -def spotters() -> xr.Dataset: - """Returns the SOFAR ocean drifters ragged array dataset as an Xarray dataset. +def spotters(decode_times: bool = True) -> xr.Dataset: + """Returns the Sofar Ocean Spotter drifters ragged array dataset as an Xarray dataset. The data is accessed from a zarr archive hosted on a public AWS S3 bucket accessible at https://sofar-spotter-archive.s3.amazonaws.com/spotter_data_bulk_zarr. + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. Default is True. + Returns ------- xarray.Dataset @@ -304,10 +341,10 @@ def spotters() -> xr.Dataset: title: Sofar Spotter Data Archive - Bulk Wave Parameters """ url = "https://sofar-spotter-archive.s3.amazonaws.com/spotter_data_bulk_zarr" - return xr.open_dataset(url, engine="zarr") + return xr.open_dataset(url, engine="zarr", decode_times=decode_times) -def subsurface_floats() -> xr.Dataset: +def subsurface_floats(decode_times: bool = True) -> xr.Dataset: """Returns the subsurface floats dataset as a ragged array Xarray dataset. The data is accessed from a public HTTPS server at NOAA's Atlantic @@ -335,6 +372,13 @@ def subsurface_floats() -> xr.Dataset: compiled in a single Matlab data set. See here for more information on the variables contained in these files. + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. 
Default is True. + Returns ------- xarray.Dataset @@ -390,11 +434,11 @@ def subsurface_floats() -> xr.Dataset: print(f"{local_file} not found; download from upstream repository.") ds = adapters.subsurface_floats.to_xarray() else: - ds = xr.open_dataset(local_file) + ds = xr.open_dataset(local_file, decode_times=decode_times) return ds -def yomaha() -> xr.Dataset: +def yomaha(decode_times: bool = True) -> xr.Dataset: """Returns the YoMaHa dataset as a ragged array Xarray dataset. The function will first look for the ragged-array dataset on the local @@ -402,11 +446,12 @@ def yomaha() -> xr.Dataset: corresponding adapter function and stored for later access. The upstream data is available at http://apdrc.soest.hawaii.edu/projects/yomaha/. - Reference - --------- - Lebedev, K. V., Yoshinari, H., Maximenko, N. A., & Hacker, P. W. (2007). Velocity data - assessed from trajectories of Argo floats at parking level and at the sea - surface. IPRC Technical Note, 4(2), 1-16. + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. Default is True. Returns ------- @@ -449,6 +494,12 @@ def yomaha() -> xr.Dataset: publisher_name: Asia-Pacific Data Research Center publisher_url: http://apdrc.soest.hawaii.edu/index.php license: Creative Commons Attribution 4.0 International License.. + + References + ---------- + Lebedev, K. V., Yoshinari, H., Maximenko, N. A., & Hacker, P. W. (2007). Velocity data + assessed from trajectories of Argo floats at parking level and at the sea + surface. IPRC Technical Note, 4(2), 1-16.
""" clouddrift_path = ( os.path.expanduser("~/.clouddrift") @@ -462,11 +513,11 @@ def yomaha() -> xr.Dataset: os.makedirs(os.path.dirname(local_file), exist_ok=True) ds.to_netcdf(local_file) else: - ds = xr.open_dataset(local_file) + ds = xr.open_dataset(local_file, decode_times=decode_times) return ds -def andro() -> xr.Dataset: +def andro(decode_times: bool = True) -> xr.Dataset: """Returns the ANDRO as a ragged array Xarray dataset. The function will first look for the ragged-array dataset on the local @@ -474,11 +525,13 @@ def andro() -> xr.Dataset: corresponding adapter function and stored for later access. The upstream data is available at https://www.seanoe.org/data/00360/47077/. - Reference - --------- - Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles, - Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset. - SEANOE. https://doi.org/10.17882/47077 + Parameters + ---------- + decode_times : bool, optional + If True, decode the time coordinate into a datetime object. If False, the time + coordinate will be an int64 or float64 array of increments since the origin + time indicated in the units attribute. Default is True. + Returns ------- xarray.Dataset @@ -518,6 +571,12 @@ def andro() -> xr.Dataset: publisher_name: SEANOE (SEA scieNtific Open data Edition) publisher_url: https://www.seanoe.org/data/00360/47077/ license: freely available + + References + ---------- + Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles, + Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset. + SEANOE.
https://doi.org/10.17882/47077 """ clouddrift_path = ( os.path.expanduser("~/.clouddrift") @@ -531,5 +590,5 @@ def andro() -> xr.Dataset: os.makedirs(os.path.dirname(local_file), exist_ok=True) ds.to_netcdf(local_file) else: - ds = xr.open_dataset(local_file) + ds = xr.open_dataset(local_file, decode_times=decode_times) return ds diff --git a/clouddrift/ragged.py b/clouddrift/ragged.py index 13a77148..98425e95 100644 --- a/clouddrift/ragged.py +++ b/clouddrift/ragged.py @@ -539,7 +539,7 @@ def segment( def subset( ds: xr.Dataset, criteria: dict, - id_var_name: str = "ID", + id_var_name: str = "id", rowsize_var_name: str = "rowsize", traj_dim_name: str = "traj", obs_dim_name: str = "obs", @@ -561,7 +561,7 @@ def subset( criteria : dict dictionary containing the variables and the ranges/values to subset id_var_name : str, optional - Name of the variable containing the ID of the trajectories (default is "ID") + Name of the variable containing the ID of the trajectories (default is "id") rowsize_var_name : str, optional Name of the variable containing the number of observations per trajectory (default is "rowsize") traj_dim_name : str, optional @@ -607,7 +607,7 @@ def subset( Retrieve specific drifters from their IDs: - >>> subset(ds, {"ID": [2578, 2582, 2583]}) + >>> subset(ds, {"id": [2578, 2582, 2583]}) Sometimes, you may want to retrieve specific rows of a ragged array. You can do that by filtering along the trajectory dimension directly, since diff --git a/docs/datasets.rst b/docs/datasets.rst index c59b189c..e12ec970 100644 --- a/docs/datasets.rst +++ b/docs/datasets.rst @@ -67,12 +67,12 @@ Currently available datasets are: hosted by NOAA AOML at `NOAA's Atlantic Oceanographic and Meteorological Laboratory (AOML) _` and maintained by Andree Ramsey and Heather Furey from the Woods Hole Oceanographic Institution. -- :func:`clouddrift.datasets.spotters`: The SOFAR ocean spotters archive dataset as hosted at the public `AWS S3 bucket `_. 
+- :func:`clouddrift.datasets.spotters`: The Sofar Ocean Spotters archive dataset as hosted at the public `AWS S3 bucket <https://sofar-spotter-archive.s3.amazonaws.com/spotter_data_bulk_zarr>`_. - :func:`clouddrift.datasets.yomaha`: The YoMaHa'07 dataset as a ragged array processed from the upstream dataset hosted at the `Asia-Pacific Data-Research Center (APDRC) <http://apdrc.soest.hawaii.edu/projects/yomaha/>`_. -The GDP nd the spotters datasets are accessed lazily, so the data is only downloaded when -specific array values are referenced. The ANDRO, GLAD, MOSAiC, Subsurface floats, and YoMaHa'07 +The GDP and the Spotters datasets are accessed lazily, so the data is only downloaded when +specific array values are referenced. The ANDRO, GLAD, MOSAiC, Subsurface Floats, and YoMaHa'07 datasets are downloaded in their entirety when the function is called for the first time and stored locally for later use. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 24b2d529..73e9bda1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "clouddrift" -version = "0.30.0" +version = "0.29.0" authors = [ { name="Shane Elipot", email="selipot@miami.edu" }, { name="Philippe Miron", email="philippemiron@gmail.com" }, diff --git a/tests/datasets_tests.py b/tests/datasets_tests.py index 52edd0a5..68893a65 100644 --- a/tests/datasets_tests.py +++ b/tests/datasets_tests.py @@ -9,15 +9,15 @@ class datasets_tests(unittest.TestCase): - def test_gdp1h_opens(self): + def test_gdp1h(self): ds = datasets.gdp1h() self.assertTrue(ds) - def test_gdp6h_opens(self): + def test_gdp6h(self): ds = datasets.gdp6h() self.assertTrue(ds) - def test_glad_opens(self): + def test_glad(self): ds = datasets.glad() self.assertTrue(ds) diff --git a/tests/ragged_tests.py b/tests/ragged_tests.py index 1f3bf0e8..371d7340 100644 --- a/tests/ragged_tests.py +++ b/tests/ragged_tests.py @@ -42,7 +42,7 @@ def sample_ragged_array() -> RaggedArray: variables_coords = ["ids", "time", "lon", "lat"] coords = {"lon": longitude, "lat": latitude, "ids":
ids, "time": t} - metadata = {"ID": drifter_id, "rowsize": rowsize} + metadata = {"id": drifter_id, "rowsize": rowsize} data = {"test": test} # append xr.Dataset to a list @@ -79,7 +79,7 @@ def sample_ragged_array() -> RaggedArray: [0, 1, 2], lambda i: list_ds[i], variables_coords, - ["ID", "rowsize"], + ["id", "rowsize"], ["test"], ) @@ -584,12 +584,12 @@ def test_ds_unmodified(self): def test_equal(self): ds_sub = subset(self.ds, {"test": True}) - self.assertEqual(len(ds_sub.ID), 2) + self.assertEqual(len(ds_sub.id), 2) def test_select(self): - ds_sub = subset(self.ds, {"ID": [1, 2]}) - self.assertTrue(all(ds_sub.ID == [1, 2])) - self.assertEqual(len(ds_sub.ID), 2) + ds_sub = subset(self.ds, {"id": [1, 2]}) + self.assertTrue(all(ds_sub.id == [1, 2])) + self.assertEqual(len(ds_sub.id), 2) def test_range(self): # positive @@ -608,28 +608,28 @@ def test_range(self): # negative range ds_sub = subset(self.ds, {"lon": (-180, 0)}) traj_idx = np.insert(np.cumsum(ds_sub["rowsize"].values), 0, 0) - self.assertEqual(len(ds_sub.ID), 1) - self.assertEqual(ds_sub.ID[0], 1) + self.assertEqual(len(ds_sub.id), 1) + self.assertEqual(ds_sub.id[0], 1) self.assertTrue(all(ds_sub.lon == [-121, -111])) # both ds_sub = subset(self.ds, {"lon": (-30, 30)}) traj_idx = np.insert(np.cumsum(ds_sub["rowsize"].values), 0, 0) - self.assertEqual(len(ds_sub.ID), 1) - self.assertEqual(ds_sub.ID[0], 2) + self.assertEqual(len(ds_sub.id), 1) + self.assertEqual(ds_sub.id[0], 2) self.assertTrue(all(ds_sub.lon[slice(traj_idx[0], traj_idx[1])] == ([12, 22]))) def test_combine(self): ds_sub = subset( - self.ds, {"ID": [1, 2], "lat": (-90, 20), "lon": (-180, 25), "test": True} + self.ds, {"id": [1, 2], "lat": (-90, 20), "lon": (-180, 25), "test": True} ) - self.assertTrue(all(ds_sub.ID == [1, 2])) + self.assertTrue(all(ds_sub.id == [1, 2])) self.assertTrue(all(ds_sub.lon == [-121, -111, 12])) self.assertTrue(all(ds_sub.lat == [-90, -45, 10])) def test_empty(self): - ds_sub = subset(self.ds, {"ID": 3, "lon": 
(-180, 0)}) - self.assertTrue(ds_sub.sizes == {}) + ds_sub = subset(self.ds, {"id": 3, "lon": (-180, 0)}) + self.assertTrue(ds_sub.dims == {}) def test_unknown_var(self): with self.assertRaises(ValueError): @@ -640,43 +640,43 @@ def test_unknown_var(self): def test_ragged_array_with_id_as_str(self): ds_str = self.ds.copy() - ds_str["ID"].values = ds_str["ID"].astype(str) + ds_str["id"].values = ds_str["id"].astype(str) - ds_sub = subset(ds_str, {"ID": ds_str["ID"].values[0]}) - self.assertTrue(ds_sub["ID"].size == 1) + ds_sub = subset(ds_str, {"id": ds_str["id"].values[0]}) + self.assertTrue(ds_sub["id"].size == 1) - ds_sub = subset(ds_str, {"ID": list(ds_str["ID"].values[:2])}) - self.assertTrue(ds_sub["ID"].size == 2) + ds_sub = subset(ds_str, {"id": list(ds_str["id"].values[:2])}) + self.assertTrue(ds_sub["id"].size == 2) def test_ragged_array_with_id_as_object(self): ds_str = self.ds.copy() - ds_str["ID"].values = ds_str["ID"].astype(object) + ds_str["id"].values = ds_str["id"].astype(object) - ds_sub = subset(ds_str, {"ID": ds_str["ID"].values[0]}) - self.assertTrue(ds_sub["ID"].size == 1) + ds_sub = subset(ds_str, {"id": ds_str["id"].values[0]}) + self.assertTrue(ds_sub["id"].size == 1) - ds_sub = subset(ds_str, {"ID": list(ds_str["ID"].values[:2])}) - self.assertTrue(ds_sub["ID"].size == 2) + ds_sub = subset(ds_str, {"id": list(ds_str["id"].values[:2])}) + self.assertTrue(ds_sub["id"].size == 2) def test_arraylike_criterion(self): # DataArray - ds_sub = subset(self.ds, {"ID": self.ds["ID"][:2]}) - self.assertTrue(ds_sub["ID"].size == 2) + ds_sub = subset(self.ds, {"id": self.ds["id"][:2]}) + self.assertTrue(ds_sub["id"].size == 2) # NumPy array - ds_sub = subset(self.ds, {"ID": self.ds["ID"][:2].values}) - self.assertTrue(ds_sub["ID"].size == 2) + ds_sub = subset(self.ds, {"id": self.ds["id"][:2].values}) + self.assertTrue(ds_sub["id"].size == 2) def test_full_trajectories(self): ds_id_rowsize = { - i: j for i, j in zip(self.ds.ID.values, 
self.ds.rowsize.values) + i: j for i, j in zip(self.ds.id.values, self.ds.rowsize.values) } ds_sub = subset(self.ds, {"lon": (-125, -111)}, full_trajectories=True) self.assertTrue(all(ds_sub.lon == [-121, -111, 51, 61, 71])) ds_sub_id_rowsize = { - i: j for i, j in zip(ds_sub.ID.values, ds_sub.rowsize.values) + i: j for i, j in zip(ds_sub.id.values, ds_sub.rowsize.values) } for k, v in ds_sub_id_rowsize.items(): self.assertTrue(ds_id_rowsize[k] == v) @@ -685,7 +685,7 @@ def test_full_trajectories(self): self.assertTrue(all(ds_sub.lat == [10, 20, 30, 40])) ds_sub_id_rowsize = { - i: j for i, j in zip(ds_sub.ID.values, ds_sub.rowsize.values) + i: j for i, j in zip(ds_sub.id.values, ds_sub.rowsize.values) } for k, v in ds_sub_id_rowsize.items(): self.assertTrue(ds_id_rowsize[k] == v) @@ -696,7 +696,7 @@ def test_full_trajectories(self): def test_subset_by_rows(self): rows = [0, 2] # test extracting first and third rows ds_sub = subset(self.ds, {"traj": rows}) - self.assertTrue(all(ds_sub["ID"] == [1, 2])) + self.assertTrue(all(ds_sub["id"] == [1, 2])) self.assertTrue(all(ds_sub["rowsize"] == [5, 4]))