Update gdp datasets (fix) (#354)
Update GDP datasets; centralize download logic and enhance it with a retry mechanism.

closes #353
kevinsantana11 authored and Philippe Miron committed Jan 20, 2024
1 parent e66db2d commit c438051
Showing 15 changed files with 286 additions and 205 deletions.
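Every adapter touched by this commit now delegates to a single download_with_progress helper in clouddrift.adapters.utils, called with a list of (url, destination) pairs, where the destination is either a local file path or a writable buffer. The new utils module itself is not expanded in this view, so the following is only a minimal sketch of what a batch downloader with retries and that call signature could look like; the requests/tqdm usage, the max_attempts and chunk_size parameters, and the backoff policy are illustrative assumptions, not the actual implementation.

import os
import time
from io import BufferedIOBase
from typing import Iterable, Tuple, Union

import requests
from tqdm import tqdm


def download_with_progress(
    download_map: Iterable[Tuple[str, Union[str, BufferedIOBase]]],
    desc: str = "Downloading files",
    max_attempts: int = 3,  # assumed retry policy, for illustration only
    chunk_size: int = 1024,
):
    """Download each (url, destination) pair, retrying failed transfers."""
    for url, dst in tqdm(list(download_map), desc=desc, ncols=80):
        # Skip files that already exist on disk (mirrors the old fetch_netcdf behavior).
        if isinstance(dst, str) and os.path.isfile(dst):
            continue
        for attempt in range(max_attempts):
            try:
                if not isinstance(dst, str):
                    dst.seek(0)
                    dst.truncate()  # drop partial data left by a failed attempt
                with requests.get(url, stream=True, timeout=30) as r:
                    r.raise_for_status()
                    sink = open(dst, "wb") if isinstance(dst, str) else dst
                    try:
                        for chunk in r.iter_content(chunk_size=chunk_size):
                            sink.write(chunk)
                    finally:
                        if isinstance(dst, str):
                            sink.close()
                break
            except requests.RequestException:
                if attempt == max_attempts - 1:
                    raise
                time.sleep(2**attempt)  # simple exponential backoff between retries

With a helper along these lines in place, each adapter below reduces to a one-liner such as download_with_progress([(ANDRO_URL, local_file)]).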
27 changes: 20 additions & 7 deletions clouddrift/adapters/__init__.py
@@ -8,10 +8,23 @@
in the future.
"""

import clouddrift.adapters.andro
import clouddrift.adapters.gdp1h
import clouddrift.adapters.gdp6h
import clouddrift.adapters.glad
import clouddrift.adapters.mosaic
import clouddrift.adapters.subsurface_floats
import clouddrift.adapters.yomaha
import clouddrift.adapters.andro as andro
import clouddrift.adapters.gdp1h as gdp1h
import clouddrift.adapters.gdp6h as gdp6h
import clouddrift.adapters.glad as glad
import clouddrift.adapters.mosaic as mosaic
import clouddrift.adapters.subsurface_floats as subsurface_floats
import clouddrift.adapters.yomaha as yomaha
import clouddrift.adapters.utils as utils


__all__ = [
"andro",
"gdp1h",
"gdp6h",
"glad",
"mosaic",
"subsurface_floats",
"yomaha",
"utils",
]
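Aliasing the imports and listing them in __all__ re-exports every adapter, including the new utils module, from the package namespace. A short sketch using only the names shown above; the assertion holds because gdp6h imports download_with_progress from utils:

from clouddrift.adapters import gdp6h, utils

# Every adapter now shares the downloader exposed in utils.
assert gdp6h.download_with_progress is utils.download_with_progress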
4 changes: 2 additions & 2 deletions clouddrift/adapters/andro.py
@@ -17,7 +17,7 @@
SEANOE. https://doi.org/10.17882/47077
"""

from clouddrift.adapters.yomaha import download_with_progress
from clouddrift.adapters.utils import download_with_progress
from datetime import datetime
import numpy as np
import os
@@ -39,7 +39,7 @@ def to_xarray(tmp_path: str = None):

# get or update dataset
local_file = f"{tmp_path}/{ANDRO_URL.split('/')[-1]}"
download_with_progress(ANDRO_URL, local_file)
download_with_progress([(ANDRO_URL, local_file)])

# parse with panda
col_names = [
8 changes: 2 additions & 6 deletions clouddrift/adapters/gdp.py
@@ -5,12 +5,11 @@
and six-hourly (``clouddrift.adapters.gdp6h``) GDP modules.
"""

from clouddrift.adapters.utils import download_with_progress
import numpy as np
import os
import pandas as pd
import xarray as xr
import urllib.request
import warnings

GDP_COORDS = [
"ids",
@@ -188,10 +187,7 @@ def fetch_netcdf(url: str, file: str):
file : str
Name of the file to save.
"""
if not os.path.isfile(file):
urllib.request.urlretrieve(url, file)
else:
warnings.warn(f"{file} already exists; skip download.")
download_with_progress([(url, file)])


def decode_date(t):
45 changes: 21 additions & 24 deletions clouddrift/adapters/gdp1h.py
@@ -6,21 +6,20 @@

import clouddrift.adapters.gdp as gdp
from clouddrift.raggedarray import RaggedArray
from datetime import datetime
from clouddrift.adapters.utils import download_with_progress
from datetime import datetime, timedelta
import numpy as np
import urllib.request
import concurrent.futures
import re
import tempfile
from tqdm import tqdm
from typing import Optional
import os
import warnings
import xarray as xr

GDP_VERSION = "2.01"

GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/v2.01/netcdf/"
GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/hourly_product/v2.01/"
GDP_DATA_URL_EXPERIMENTAL = (
"https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental/"
)
@@ -108,25 +107,11 @@ def download(
rng = np.random.RandomState(42)
drifter_ids = sorted(rng.choice(drifter_ids, n_random_id, replace=False))

with concurrent.futures.ThreadPoolExecutor() as executor:
# create list of urls and paths
urls = []
files = []
for i in drifter_ids:
file = filename_pattern.format(id=i)
urls.append(os.path.join(url, file))
files.append(os.path.join(tmp_path, file))

# parallel retrieving of individual netCDF files
list(
tqdm(
executor.map(gdp.fetch_netcdf, urls, files),
total=len(files),
desc="Downloading files",
ncols=80,
)
)

download_requests = [
(os.path.join(url, file_name), os.path.join(tmp_path, file_name))
for file_name in map(lambda d_id: filename_pattern.format(id=d_id), drifter_ids)
]
download_with_progress(download_requests)

# Download the metadata so we can order the drifter IDs by end date.
gdp_metadata = gdp.get_gdp_metadata()

@@ -490,6 +475,8 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
"title": "Global Drifter Program hourly drifting buoy collection",
"history": f"version {GDP_VERSION}. Metadata from dirall.dat and deplog.dat",
"Conventions": "CF-1.6",
"time_coverage_start": "",
"time_coverage_end": "",
"date_created": datetime.now().isoformat(),
"publisher_name": "GDP Drifter DAC",
"publisher_email": "aoml.dftr@noaa.gov",
@@ -602,7 +589,7 @@ def to_raggedarray(
else:
raise ValueError(f"url must be {GDP_DATA_URL} or {GDP_DATA_URL_EXPERIMENTAL}.")

return RaggedArray.from_files(
ra = RaggedArray.from_files(
indices=ids,
preprocess_func=preprocess,
name_coords=gdp.GDP_COORDS,
@@ -612,3 +599,13 @@
filename_pattern=filename_pattern,
tmp_path=tmp_path,
)

# set dynamic global attributes
ra.attrs_global[
"time_coverage_start"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
ra.attrs_global[
"time_coverage_end"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"

return ra

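The two dynamic attributes set just above assume the time coordinate stores seconds since 1970-01-01; the f-strings convert the ragged array's minimum and maximum times into the %Y-%m-%d:%H:%M:%SZ stamp used for time_coverage_start and time_coverage_end. A self-contained check of that conversion, with a made-up epoch value standing in for ra.coords['time']:

from datetime import datetime, timedelta

epoch_seconds = 1_600_000_000  # hypothetical value standing in for np.min(ra.coords["time"])
stamp = datetime(1970, 1, 1) + timedelta(seconds=int(epoch_seconds))
print(f"{stamp:%Y-%m-%d:%H:%M:%SZ}")  # -> 2020-09-13:12:26:40Z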
59 changes: 30 additions & 29 deletions clouddrift/adapters/gdp6h.py
@@ -5,21 +5,21 @@
"""

import clouddrift.adapters.gdp as gdp
from clouddrift.adapters.utils import download_with_progress
from clouddrift.raggedarray import RaggedArray
from datetime import datetime
from datetime import datetime, timedelta
import numpy as np
import urllib.request
import concurrent.futures
import re
import tempfile
from tqdm import tqdm
from typing import Optional
import os
import warnings
import xarray as xr

GDP_VERSION = "September 2023"

GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/netcdf/"
GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h/"
GDP_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp6h")
GDP_DATA = [
"lon",
@@ -57,20 +57,20 @@ def download(
Returns
-------
out : list
List of retrived drifters
List of retrieved drifters
"""

print(f"Downloading GDP 6-hourly data to {tmp_path}...")

# Create a temporary directory if doesn't already exists.
os.makedirs(tmp_path, exist_ok=True)

pattern = "drifter_[0-9]*.nc"
pattern = "drifter_6h_[0-9]*.nc"

directory_list = [
"buoydata_1_5000",
"buoydata_5001_10000",
"buoydata_10001_15000",
"buoydata_15001_oct22",
"netcdf_1_5000",
"netcdf_5001_10000",
"netcdf_10001_15000",
"netcdf_15001_current",
]

# retrieve all drifter ID numbers
@@ -94,25 +94,14 @@
rng = np.random.RandomState(42)
drifter_urls = rng.choice(drifter_urls, n_random_id, replace=False)

with concurrent.futures.ThreadPoolExecutor() as executor:
# Asynchronously download individual netCDF files
list(
tqdm(
executor.map(
gdp.fetch_netcdf,
drifter_urls,
[os.path.join(tmp_path, os.path.basename(f)) for f in drifter_urls],
),
total=len(drifter_urls),
desc="Downloading files",
ncols=80,
)
)
download_with_progress(
[(url, os.path.join(tmp_path, os.path.basename(url))) for url in drifter_urls]
)

# Download the metadata so we can order the drifter IDs by end date.
gdp_metadata = gdp.get_gdp_metadata()
drifter_ids = [
int(os.path.basename(f).split("_")[1].split(".")[0]) for f in drifter_urls
int(os.path.basename(f).split("_")[2].split(".")[0]) for f in drifter_urls
]

return gdp.order_by_date(gdp_metadata, drifter_ids)
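The index change from split("_")[1] to split("_")[2] tracks the new drifter_6h_{id}.nc naming: the numeric ID is now the third underscore-separated token. A quick check with a hypothetical drifter ID and a URL shaped like the netcdf_1_5000 listings above:

import os

# hypothetical URL following the new drifter_6h_{id}.nc pattern
url = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h/netcdf_1_5000/drifter_6h_101143.nc"
basename = os.path.basename(url)                        # "drifter_6h_101143.nc"
drifter_id = int(basename.split("_")[2].split(".")[0])  # 101143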
@@ -392,9 +381,11 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:

# global attributes
attrs = {
"title": "Global Drifter Program hourly drifting buoy collection",
"history": f"version {gdp.GDP_VERSION}. Metadata from dirall.dat and deplog.dat",
"title": "Global Drifter Program drifting buoy collection",
"history": f"version {GDP_VERSION}. Metadata from dirall.dat and deplog.dat",
"Conventions": "CF-1.6",
"time_coverage_start": "",
"time_coverage_end": "",
"date_created": datetime.now().isoformat(),
"publisher_name": "GDP Drifter DAC",
"publisher_email": "aoml.dftr@noaa.gov",
@@ -485,13 +476,23 @@ def to_raggedarray(
"""
ids = download(drifter_ids, n_random_id, GDP_DATA_URL, tmp_path)

return RaggedArray.from_files(
ra = RaggedArray.from_files(
indices=ids,
preprocess_func=preprocess,
name_coords=gdp.GDP_COORDS,
name_meta=gdp.GDP_METADATA,
name_data=GDP_DATA,
rowsize_func=gdp.rowsize,
filename_pattern="drifter_{id}.nc",
filename_pattern="drifter_6h_{id}.nc",
tmp_path=tmp_path,
)

# update dynamic global attributes
ra.attrs_global[
"time_coverage_start"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
ra.attrs_global[
"time_coverage_end"
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"

return ra

15 changes: 4 additions & 11 deletions clouddrift/adapters/glad.py
@@ -13,11 +13,10 @@
---------
Özgökmen, Tamay. 2013. GLAD experiment CODE-style drifter trajectories (low-pass filtered, 15 minute interval records), northern Gulf of Mexico near DeSoto Canyon, July-October 2012. Distributed by: Gulf of Mexico Research Initiative Information and Data Cooperative (GRIIDC), Harte Research Institute, Texas A&M University–Corpus Christi. doi:10.7266/N7VD6WC8
"""
from io import StringIO
from clouddrift.adapters.utils import download_with_progress
from io import BytesIO
import numpy as np
import pandas as pd
import requests
import tqdm
import xarray as xr


@@ -27,15 +26,9 @@ def get_dataframe() -> pd.DataFrame:
# GRIIDC server doesn't provide Content-Length header, so we'll hardcode
# the expected data length here.
file_size = 155330876
r = requests.get(url, stream=True)
progress_bar = tqdm.tqdm(total=file_size, unit="iB", unit_scale=True)
buf = StringIO()
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
buf.write(chunk.decode("utf-8"))
progress_bar.update(len(chunk))
buf = BytesIO(b"")
download_with_progress([(url, buf)])
buf.seek(0)
progress_bar.close()
column_names = [
"id",
"date",
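glad.py now streams the CSV into an in-memory buffer through the shared helper rather than hand-rolling a requests loop; the buf.seek(0) rewind before parsing is the one piece of bookkeeping the caller keeps. The same pattern in isolation, with an illustrative URL:

from io import BytesIO
import pandas as pd
from clouddrift.adapters.utils import download_with_progress

buf = BytesIO()
download_with_progress([("https://example.com/drifters.csv", buf)])  # URL is illustrative
buf.seek(0)  # rewind: writing leaves the cursor at the end of the buffer
df = pd.read_csv(buf)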
15 changes: 6 additions & 9 deletions clouddrift/adapters/mosaic.py
@@ -18,15 +18,17 @@
>>> from clouddrift.adapters import mosaic
>>> ds = mosaic.to_xarray()
"""
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from io import BytesIO
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
import xarray as xr
import xml.etree.ElementTree as ET

from clouddrift.adapters.utils import download_with_progress

MOSAIC_VERSION = "2022"


@@ -56,15 +58,10 @@ def get_dataframes() -> tuple[pd.DataFrame, pd.DataFrame]:
range(len(sensor_ids)), key=lambda k: order_index[sensor_ids[k]]
)
sorted_data_urls = [data_urls[i] for i in sorted_indices]
buffers = [BytesIO(b"") for _ in range(len(sorted_data_urls))]


with ThreadPoolExecutor() as executor:
dfs = tqdm(
executor.map(pd.read_csv, sorted_data_urls),
total=len(sorted_data_urls),
desc="Downloading data",
ncols=80,
)

download_with_progress(zip(sorted_data_urls, buffers), desc="Downloading data")
dfs = [pd.read_csv(b) for b in buffers]

obs_df = pd.concat(dfs)

# Use the index of the concatenated DataFrame to determine the count/rowsize
11 changes: 3 additions & 8 deletions clouddrift/adapters/subsurface_floats.py
@@ -17,10 +17,11 @@
import pandas as pd
import scipy.io
import tempfile
import urllib.request
import xarray as xr
import warnings

from clouddrift.adapters.utils import download_with_progress

SUBSURFACE_FLOATS_DATA_URL = (
"https://www.aoml.noaa.gov/phod/float_traj/files/allFloats_12122017.mat"
)
@@ -31,13 +32,7 @@


def download(file: str):
if not os.path.isfile(file):
print(
f"Downloading Subsurface float trajectories from {SUBSURFACE_FLOATS_DATA_URL} to {file}..."
)
urllib.request.urlretrieve(SUBSURFACE_FLOATS_DATA_URL, file)
else:
warnings.warn(f"{file} already exists; skip download.")
download_with_progress([(SUBSURFACE_FLOATS_DATA_URL, file)])


def to_xarray(
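Since the adapters' public entry points are unchanged, existing workflows keep working while downloads gain retries and shared progress reporting. A hedged end-to-end sketch (network access required; to_raggedarray and n_random_id appear in the diffs above, while to_xarray() is assumed to behave as clouddrift's RaggedArray normally does):

from clouddrift.adapters import gdp6h

# Fetch three randomly chosen drifters (the selection is seeded internally) and
# assemble them into a ragged array, then an xarray Dataset.
ra = gdp6h.to_raggedarray(n_random_id=3)
ds = ra.to_xarray()
print(ds.attrs.get("time_coverage_start"), ds.attrs.get("time_coverage_end"))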
