From fa1421093c82ce679739d6cf47392ea16eac8ba4 Mon Sep 17 00:00:00 2001
From: Shane Elipot <selipot@miami.edu>
Date: Thu, 21 Dec 2023 12:49:55 -0500
Subject: [PATCH] docstring edits

---
 clouddrift/ragged.py | 42 ++++++++++++++++++------------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/clouddrift/ragged.py b/clouddrift/ragged.py
index 2771732d..5753877a 100644
--- a/clouddrift/ragged.py
+++ b/clouddrift/ragged.py
@@ -545,22 +545,22 @@ def subset(
     obs_dim_name: str = "obs",
     full_trajectories=False,
 ) -> xr.Dataset:
-    """Subset the dataset as a function of one or many criteria. The criteria are
-    passed as a dictionary, where a variable to subset is assigned to either a
-    range (valuemin, valuemax), a list [value1, value2, valueN], a single value,
-    or a masking function applied to every trajectory using ``apply_ragged``
+    """Subset a ragged array dataset as a function of one or more criteria.
+    The criteria are passed as a dictionary, where each dictionary key
+    is a variable to subset and the associated dictionary value is either a range
+    (valuemin, valuemax), a list [value1, value2, valueN], a single value, or a
+    masking function applied to every row of the ragged array using ``apply_ragged``.
 
-    This function relies on specific names of the dataset dimensions and the
-    rowsize variables. The default expected values are listed in the Parameters
-    section, however, if your dataset uses different names for these dimensions
-    and variables, you can specify them using the optional arguments.
+    This function needs to know the names of the dimensions of the ragged array dataset
+    (``traj_dim_name`` and ``obs_dim_name``), and the name of the rowsize variable (``rowsize_var_name``).
+    Default values are provided for these arguments (see below), but they can be changed if needed.
 
     Parameters
     ----------
     ds : xr.Dataset
-        Lagrangian dataset stored in two-dimensional or ragged array format
+        Dataset stored as ragged arrays
     criteria : dict
-        dictionary containing the variables and the ranges/values/functions to subset
+        dictionary containing the variables (as keys) and the ranges/values/functions (as values) to subset
     id_var_name : str, optional
         Name of the variable containing the ID of the trajectories (default is "id")
     rowsize_var_name : str, optional
@@ -570,7 +570,7 @@ def subset(
     obs_dim_name : str, optional
         Name of the observation dimension (default is "obs")
     full_trajectories : bool, optional
-        If True, it returns the complete trajectories where at least one observation
+        If True, it returns the complete trajectories (rows) where at least one observation
         matches the criteria, rather than just the segments where the criteria are satisfied.
         Default is False.
 
@@ -582,7 +582,8 @@ def subset(
     Examples
     --------
     Criteria are combined on any data or metadata variables part of the Dataset.
-    The following examples are based on the GDP dataset.
+    The following examples are based on NOAA GDP datasets, which can be accessed with the
+    ``clouddrift.datasets`` module.
 
     Retrieve a region, like the Gulf of Mexico, using ranges of latitude and longitude:
 
@@ -630,18 +631,11 @@ def subset(
 
     >>> subset(ds, {"lat": (21, 31), "lon": (-98, -78), "drogue_status": True, "sst": (303.15, np.inf), "time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))})
 
-    Retrieve observations every 24 hours after the first one, trajectory-wise:
-
-    >>> def daily_masking(
-    >>>     traj_time: xr.DataArray
-    >>> ) -> np.ndarray:
-    >>>     traj_time = traj_time.astype("float64")
-    >>>     traj_time /= 1e9  # to seconds
-    >>>     traj_time -= traj_time[0]  # start from 0
-    >>>     mask = (traj_time % (24*60*60)) == 0  # get only obs every 24 hours after the first one
-    >>>     rowsize = int(mask.sum())  # the number of obs per traj has to be updated
-    >>>     return mask, rowsize
-    >>> subset(ds, {"time": daily_masking})
+    You can also use a function to filter the data. For example, keep the observations of each
+    trajectory (row) whose ``time`` value differs from the row's first value by a multiple of 2:
+
+    >>> func = (lambda arr: ((arr - arr[0]) % 2) == 0)
+    >>> subset(ds, {"time": func})
 
     Raises
     ------