nextstrain · tsibley · Aug 2, 2022 · Aug 1, 2022
diff --git a/CHANGES.md b/CHANGES.md
@@ -13,6 +13,55 @@ development source code and as such may not be routinely kept up to date.
 
 # __NEXT__
 
+This release contains **a potentially-breaking change** for existing usages of
+`nextstrain remote download`.  The change is described below.
+
+## Improvements
+
+* The local filenames produced by `nextstrain remote download` now include
+  more of the remote dataset/narrative path.  This reduces the potential for
+  ambiguous filenames and makes it easier to copy datasets/narratives between
+  destinations (e.g. from one group to another) while retaining the same path.
+  It is, however, a **potentially-breaking change** if you're relying on the
+  filenames of the downloaded datasets/narratives (e.g. for automation).
+
+  For example, downloading `nextstrain.org/flu/seasonal/h3n2/ha/2y` previously
+  produced the local files:
+
+  ```
+  2y.json
+  2y_root-sequence.json
+  2y_tip-frequencies.json
+  ```
+
+  which could easily conflict with the similarly-named
+  `nextstrain.org/flu/seasonal/h3n2/na/2y`,
+  `nextstrain.org/flu/seasonal/h1n1pdm/ha/2y`, etc.  The downloaded files are
+  now named:
+
+  ```
+  flu_seasonal_h3n2_ha_2y.json
+  flu_seasonal_h3n2_ha_2y_root-sequence.json
+  flu_seasonal_h3n2_ha_2y_tip-frequencies.json
+  ```
+
+  Within groups, filenames similarly include more of the path, but omit the
+  `groups/<name>` prefix.  For example, downloading
+  `groups/blab/ncov/cross-species/cat` previously produced:
+
+  ```
+  cat.json
+  cat_root-sequence.json
+  cat_tip-frequencies.json
+  ```
+
+  and now produces:
+
+  ```
+  ncov_cross-species_cat.json
+  ncov_cross-species_cat_root-sequence.json
+  ncov_cross-species_cat_tip-frequencies.json
+  ```
 
 # 4.2.0 (29 July 2022)
 

diff --git a/nextstrain/cli/command/remote/download.py b/nextstrain/cli/command/remote/download.py
@@ -8,9 +8,9 @@
 
 which creates three files in the current directory::
 
-    2y.json
-    2y_root-sequence.json
-    2y_tip-frequencies.json
+    flu_seasonal_h3n2_ha_2y.json
+    flu_seasonal_h3n2_ha_2y_root-sequence.json
+    flu_seasonal_h3n2_ha_2y_tip-frequencies.json
 
 The --recursively option allows for downloading multiple datasets or narratives
 at once, e.g. to download all the datasets under "ncov/open/…" into an existing
@@ -20,12 +20,12 @@
 
 which creates files for each dataset::
 
-    sars-cov-2/global.json
-    sars-cov-2/global_root-sequence.json
-    sars-cov-2/global_tip-frequencies.json
-    sars-cov-2/africa.json
-    sars-cov-2/africa_root-sequence.json
-    sars-cov-2/africa_tip-frequencies.json
+    sars-cov-2/ncov_open_global.json
+    sars-cov-2/ncov_open_global_root-sequence.json
+    sars-cov-2/ncov_open_global_tip-frequencies.json
+    sars-cov-2/ncov_open_africa.json
+    sars-cov-2/ncov_open_africa_root-sequence.json
+    sars-cov-2/ncov_open_africa_tip-frequencies.json
     …
 
 See `nextstrain remote --help` for more information on remote sources.

diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py
@@ -331,33 +331,10 @@ def download(url: urllib.parse.ParseResult, local_path: Path, recursively: bool
 
                     # Local destination
                     if local_path.is_dir():
-                        if recursively:
-                            # If we're recursively downloading under a path
-                            # which is itself a dataset that exists, e.g.
-                            # `nextstrain remote download -r a/b` when the
-                            # following datasets exist:
-                            #
-                            #       a/b
-                            #       a/b/x
-                            #       a/b/y
-                            #       a/b/z
-                            #
-                            # then we need to include a little of the given
-                            # path in the local filenames, e.g. strip only "a",
-                            # leaving "b".  Otherwise, e.g. if a/b doesn't
-                            # exist itself only a/b/x, etc, then we can strip
-                            # "a/b" entirely.
-                            if path in {r.path for r in resources}:
-                                base_path = path.parent
-                            else:
-                                base_path = path
-
-                            local_name = (
-                                str(resource.path.relative_to(base_path))
-                                    .lstrip("/")
-                                    .replace("/", "_"))
-                        else:
-                            local_name = resource.path.name
+                        local_name = (
+                            str(resource.path.relative_to(namespace(resource.path)))
+                                .lstrip("/")
+                                .replace("/", "_"))
 
                         destination = local_path / local_name
                     else:
@@ -550,16 +527,72 @@ def prefixed(path: NormalizedPath) -> bool:
     >>> prefixed(normalize_path("narratives/"))
     False
     """
+    return str(path.relative_to(namespace(path))) != "."
+
+
+def namespace(path: NormalizedPath) -> NormalizedPath:
+    """
+    Return the top-level nextstrain.org namespace for *path*, i.e. its "source"
+    (in the nextstrain.org codebase's parlance) plus any "narratives/" part.
+
+    >>> namespace(normalize_path("groups/blab/abc"))
+    NormalizedPath('/groups/blab')
+    >>> namespace(normalize_path("groups/blab/abc/def"))
+    NormalizedPath('/groups/blab')
+    >>> namespace(normalize_path("groups/blab/narratives/abc"))
+    NormalizedPath('/groups/blab/narratives')
+    >>> namespace(normalize_path("groups/blab/narratives/abc/def"))
+    NormalizedPath('/groups/blab/narratives')
+    >>> namespace(normalize_path("groups/blab"))
+    NormalizedPath('/groups/blab')
+    >>> namespace(normalize_path("groups/blab/narratives/"))
+    NormalizedPath('/groups/blab/narratives')
+
+    >>> namespace(normalize_path("staging/wxyz"))
+    NormalizedPath('/staging')
+    >>> namespace(normalize_path("staging/tuv/wxyz"))
+    NormalizedPath('/staging')
+    >>> namespace(normalize_path("staging/narratives/tuv"))
+    NormalizedPath('/staging/narratives')
+    >>> namespace(normalize_path("staging/narratives/tuv/wxyz"))
+    NormalizedPath('/staging/narratives')
+    >>> namespace(normalize_path("staging"))
+    NormalizedPath('/staging')
+    >>> namespace(normalize_path("staging/narratives/"))
+    NormalizedPath('/staging/narratives')
+
+    >>> namespace(normalize_path("abc"))
+    NormalizedPath('/')
+    >>> namespace(normalize_path("abc/def"))
+    NormalizedPath('/')
+    >>> namespace(normalize_path("narratives/abc"))
+    NormalizedPath('/narratives')
+    >>> namespace(normalize_path("narratives/abc/def"))
+    NormalizedPath('/narratives')
+    >>> namespace(normalize_path("/"))
+    NormalizedPath('/')
+    >>> namespace(normalize_path("narratives/"))
+    NormalizedPath('/narratives')
+    """
     path_ = str(path)
 
-    if glob_match(path_, "/groups/*{,/**}"):
-        return not glob_match(path_, ["/groups/*", "/groups/*/narratives"])
+    if glob_match(path_, "/groups/*/narratives{,/**}"):
+        return normalize_path(f"/groups/{path.parts[2]}/narratives")
+
+    elif glob_match(path_, "/groups/*{,/**}"):
+        return normalize_path(f"/groups/{path.parts[2]}")
+
+    elif glob_match(path_, "/staging/narratives{,/**}"):
+        return normalize_path("/staging/narratives")
 
     elif glob_match(path_, "/staging{,/**}"):
-        return path_ not in {"/staging", "/staging/narratives"}
+        return normalize_path("/staging")
+
+    elif glob_match(path_, "/narratives{,/**}"):
+        return normalize_path("/narratives")
 
     else:
-        return path_ not in {"/", "/narratives"}
+        return normalize_path("/")
 
 
 def api_endpoint(path: Union[str, PurePosixPath]) -> str: