From 7d579149ec57465b1ccb2a4215163bff5afce77c Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 3 Dec 2024 16:00:44 -0800
Subject: [PATCH 01/10] Ingest: Re-align to pathogen-repo-guide

This commit updates the `ingest` directory to match the latest version of the:
https://github.com/nextstrain/pathogen-repo-guide/tree/b4001735fa55448f1426ff90cca93c667ca121ce/ingest
---
 .../nextstrain-automation/config.yaml         |   3 +-
 ingest/defaults/config.yaml                   | 142 ++++++++++--------
 ingest/rules/curate.smk                       |  22 ++-
 ingest/rules/fetch_from_ncbi.smk              |  69 +++++----
 ingest/rules/nextclade.smk                    |   2 +-
 ingest/rules/split_serotypes.smk              |   4 +-
 6 files changed, 132 insertions(+), 110 deletions(-)

diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
index 853cafb3..ce21105f 100644
--- a/ingest/build-configs/nextstrain-automation/config.yaml
+++ b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -15,8 +15,7 @@ s3_dst: "s3://nextstrain-data/files/workflows/dengue"
 
 # Mapping of files to upload
 files_to_upload:
-  genbank.ndjson.xz: data/genbank.ndjson
-  all_sequences.ndjson.xz: data/sequences.ndjson
+  ncbi.ndjson.zst: data/ncbi.ndjson
   metadata_all.tsv.zst: results/metadata_all.tsv
   sequences_all.fasta.zst: results/sequences_all.fasta
   metadata_denv1.tsv.zst: results/metadata_denv1.tsv
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index bc6d54d2..c31885d8 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -1,9 +1,14 @@
-# Sources of sequences to include in the ingest run
-sources: ['genbank']
-# Pathogen NCBI Taxonomy ID
-ncbi_taxon_id: '12637'
+# This configuration file should contain all required configuration parameters
+# for the ingest workflow to run to completion.
+#
+# Define optional config parameters with their default values here so that users
+# do not have to dig through the workflows to figure out the default values
+
+# Required to fetch from NCBI Datasets
+ncbi_taxon_id: "12637"
+
 # The list of NCBI Datasets fields to include from NCBI Datasets output
-# These need to be the mneumonics of the NCBI Datasets fields, see docs for full list of fields
+# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
 # https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
 # Note: the "accession" field MUST be provided to match with the sequences
 ncbi_datasets_fields:
@@ -19,15 +24,23 @@ ncbi_datasets_fields:
   - virus-name
   - length
   - host-name
+  - is-lab-host
   - isolate-lineage-source
   - submitter-names
   - submitter-affiliation
 
-# Params for the curate rule
+# Config parameters related to the curate pipeline
 curate:
-  # NCBI Fields to rename to Nextstrain field names.
-  # This is the first step in the pipeline, so any references to field names
-  # in the configs below should use the new field names
+  # URL pointed to public generalized geolocation rules
+  # For the Nextstrain team, this is currently
+  # "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
+  geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
+  # The path to the local geolocation rules within the pathogen repo
+  # The path should be relative to the ingest directory.
+  local_geolocation_rules: "defaults/geolocation-rules.tsv"
+  # List of field names to change where the key is the original field name and the value is the new field name
+  # The original field names should match the ncbi_datasets_fields provided above.
+  # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
     accession: genbank_accession
     accession-rev: genbank_accession_rev
@@ -35,83 +48,82 @@ curate:
     sourcedb: database
     geo-region: region
     geo-location: location
-    host-name: host
     isolate-collection-date: date
-    release-date: release_date
-    update-date: update_date
+    release-date: date_released
+    update-date: date_updated
     virus-tax-id: virus_tax_id
     virus-name: virus_name
+    length: length
+    host-name: host
+    is-lab-host: is_lab_host
+    isolate-lineage-source: sample_type
     submitter-names: authors
     submitter-affiliation: institution
   # Standardized strain name regex
-  # Currently accepts any characters because we do not have a clear standard for strain names
-  strain_regex: '^.+$'
-  # Back up strain name field if 'strain' doesn't match regex above
-  strain_backup_fields: ['genbank_accession']
-  # List of date fields to standardize
-  date_fields: ['date', 'release_date', 'update_date']
-  # Expected date formats present in date fields
+  # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
+  strain_regex: "^.+$"
+  # Back up strain name field to use if "strain" doesn"t match regex above
+  strain_backup_fields: ["genbank_accession"]
+  # List of date fields to standardize to ISO format YYYY-MM-DD
+  date_fields: ["date", "date_released", "date_updated"]
+  # List of expected date formats that are present in the date fields provided above
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
-  expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
+  expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
   # The expected field that contains the GenBank geo_loc_name
   genbank_location_field: location
-  # Titlecase rules
   titlecase:
-    # Abbreviations not cast to titlecase, keeps uppercase
-    abbreviations: ['USA']
+    # List of string fields to titlecase
+    fields: ["region", "country", "division", "location"]
+    # List of abbreviations not cast to titlecase, keeps uppercase
+    abbreviations: ["USA"]
     # Articles that should not be cast to titlecase
     articles: [
-      'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
-      'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
+      "and", "d", "de", "del", "des", "di", "do", "en", "l", "la", "las", "le",
+      "los", "nad", "of", "op", "sur", "the", "y"
     ]
-    # List of string fields to titlecase
-    fields: ['region', 'country', 'division', 'location']
-  # Authors field name
-  authors_field: 'authors'
-  # Authors default value if authors value is empty
-  authors_default_value: '?'
-  # Field name for the generated abbreviated authors
-  abbr_authors_field: 'abbr_authors'
-  # General geolocation rules to apply to geolocation fields
-  geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
-  # Local geolocation rules that are only applicable to mpox data
-  # Local rules can overwrite the general geolocation rules provided above
-  local_geolocation_rules: 'defaults/geolocation-rules.tsv'
-  # User annotations file
-  annotations: 'defaults/annotations.tsv'
+  # Metadata field that contains the list of authors associated with the sequence
+  authors_field: "authors"
+  # Default value to use if the authors field is empty
+  authors_default_value: "?"
+  # Name to use for the generated abbreviated authors field
+  abbr_authors_field: "abbr_authors"
+  # Path to the manual annotations file
+  # The path should be relative to the ingest directory
+  annotations: "defaults/annotations.tsv"
   # Serotype field name inferred from NCBI Genbank annotation
-  serotype_field: 'serotype_genbank'
-  # ID field used to merge annotations
-  annotations_id: 'genbank_accession'
-  # Field to use as the sequence ID in the FASTA file
-  id_field: 'genbank_accession'
-  # Field to use as the sequence in the FASTA file
-  sequence_field: 'sequence'
-  # Final output columns for the metadata TSV
+  serotype_field: "serotype_genbank"
+  # The ID field in the metadata to use to merge the manual annotations
+  annotations_id: "genbank_accession"
+  # The ID field in the metadata to use as the sequence id in the output FASTA file
+  output_id_field: "genbank_accession"
+  # The field in the NDJSON record that contains the actual genomic sequence
+  output_sequence_field: "sequence"
+  # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
-    'strain',
-    'genbank_accession',
-    'genbank_accession_rev',
-    'date',
-    'region',
-    'country',
-    'division',
-    'location',
-    'length',
-    'host',
-    'release_date',
-    'update_date',
-    'abbr_authors',
-    'authors',
-    'institution',
-    'serotype_genbank', # inferred from virus_tax_id
+    "strain",
+    "genbank_accession",
+    "genbank_accession_rev",
+    "date",
+    "region",
+    "country",
+    "division",
+    "location",
+    "length",
+    "host",
+    "is_lab_host",
+    "date_released",
+    "date_updated",
+    "authors",
+    "abbr_authors",
+    "institution",
+    "serotype_genbank", # inferred from virus_tax_id
   ]
 
 nextclade:
   min_length: 1000 # E gene length is approximately 1400nt
   min_seed_cover: 0.1
-  gene: ['E','C','M','pr','NS1','NS2A','NS2B','NS3','NS4A','2K','NS4B','NS5']
+  gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"]
   # Nextclade Fields to rename to metadata field names.
   field_map:
     seqName: genbank_accession # ID field used to merge annotations
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 4f1475dc..2b21a39d 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -2,7 +2,7 @@
 This part of the workflow handles curating the data into standardized
 formats and expects input file
 
-    sequences_ndjson = "data/sequences.ndjson"
+    ndjson      = data/ncbi.ndjson
 
 This will produce output files as
 
@@ -13,6 +13,10 @@ Parameters are expected to be defined in `config.curate`.
 """
 
 
+# The following two rules can be ignored if you choose not to use the
+# generalized geolocation rules that are shared across pathogens.
+# The Nextstrain team will try to maintain a generalized set of geolocation
+# rules that can then be overridden by local geolocation rules per pathogen repo.
 rule fetch_general_geolocation_rules:
     output:
         general_geolocation_rules="data/general-geolocation-rules.tsv",
@@ -42,10 +46,16 @@ def format_field_map(field_map: dict[str, str]) -> str:
     """
     return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()])
 
-
+# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data.
+# You may want to add and/or remove steps from the pipeline for custom metadata
+# curation for your pathogen. Note that the curate pipeline is streaming NDJSON
+# records between scripts, so any custom scripts added to the pipeline should expect
+# the input as NDJSON records from stdin and output NDJSON records to stdout.
+# The final step of the pipeline should convert the NDJSON records to two
+# separate files: a metadata TSV and a sequences FASTA.
 rule curate:
     input:
-        sequences_ndjson="data/sequences.ndjson",
+        sequences_ndjson="data/ncbi.ndjson",
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["curate"]["annotations"],
     output:
@@ -53,6 +63,8 @@ rule curate:
         sequences="results/sequences_all.fasta",
     log:
         "logs/curate.txt",
+    benchmark:
+        "benchmarks/curate.txt",
     params:
         field_map=format_field_map(config["curate"]["field_map"]),
         strain_regex=config["curate"]["strain_regex"],
@@ -68,8 +80,8 @@ rule curate:
         abbr_authors_field=config["curate"]["abbr_authors_field"],
         annotations_id=config["curate"]["annotations_id"],
         serotype_field=config["curate"]["serotype_field"],
-        id_field=config["curate"]["id_field"],
-        sequence_field=config["curate"]["sequence_field"],
+        id_field=config["curate"]["output_id_field"],
+        sequence_field=config["curate"]["output_sequence_field"],
     shell:
         """
         (cat {input.sequences_ndjson} \
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index efd9fbb7..55a66936 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -1,34 +1,51 @@
 """
-This part of the workflow handles fetching sequences from various sources.
-Uses `config.sources` to determine which sequences to include in final output.
+This part of the workflow handles fetching sequences and metadata from NCBI.
 
-Currently only fetches sequences from GenBank, but other sources can be
-defined in the config. If adding other sources, add a new rule upstream
-of rule `fetch_all_sequences` to create the file `data/{source}.ndjson` or the
-file must exist as a static file in the repo.
+REQUIRED INPUTS:
 
-Produces final output as
+    None
 
-    sequences_ndjson = "data/sequences.ndjson"
+OUTPUTS:
 
-"""
+    ndjson = data/ncbi.ndjson
 
+Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
+ - requires `ncbi_taxon_id` config
+ - Directly returns NDJSON without custom parsing
+ - Fastest option for large datasets (e.g. SARS-CoV-2)
+ - Only returns metadata fields that are available through NCBI Datasets
+ - Only works for viral genomes
+"""
 
 rule fetch_ncbi_dataset_package:
+    params:
+        ncbi_taxon_id=config["ncbi_taxon_id"],
     output:
         dataset_package=temp("data/ncbi_dataset.zip"),
-    retries: 5  # Requires snakemake 7.7.0 or later
+    # Allow retries in case of network errors
+    retries: 5
     benchmark:
         "benchmarks/fetch_ncbi_dataset_package.txt"
-    params:
-        ncbi_taxon_id=config["ncbi_taxon_id"],
     shell:
         """
-        datasets download virus genome taxon {params.ncbi_taxon_id} \
+        datasets download virus genome taxon {params.ncbi_taxon_id:q} \
             --no-progressbar \
             --filename {output.dataset_package}
         """
 
+# Note: This rule is not part of the default workflow!
+# It is intended to be used as a specific target for users to be able
+# to inspect and explore the full raw metadata from NCBI Datasets.
+rule dump_ncbi_dataset_report:
+    input:
+        dataset_package="data/ncbi_dataset.zip",
+    output:
+        ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv",
+    shell:
+        """
+        dataformat tsv virus-genome \
+            --package {input.dataset_package} > {output.ncbi_dataset_tsv}
+        """
 
 rule extract_ncbi_dataset_sequences:
     input:
@@ -43,9 +60,7 @@ rule extract_ncbi_dataset_sequences:
             ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences}
         """
 
-
 rule format_ncbi_dataset_report:
-    # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
     output:
@@ -60,22 +75,21 @@ rule format_ncbi_dataset_report:
             --package {input.dataset_package} \
             --fields {params.ncbi_datasets_fields:q} \
             --elide-header \
+            | csvtk fix-quotes -Ht \
             | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
             | csvtk rename -t -f accession -n accession-rev \
-            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | csvtk del-quotes -t \
             | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
-
 rule format_ncbi_datasets_ndjson:
     input:
         ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta",
         ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
     output:
-        ndjson="data/genbank.ndjson",
-    params:
-        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
+        ndjson="data/ncbi.ndjson",
     log:
         "logs/format_ncbi_datasets_ndjson.txt",
     benchmark:
@@ -91,18 +105,3 @@ rule format_ncbi_datasets_ndjson:
             --duplicate-reporting warn \
             2> {log} > {output.ndjson}
         """
-
-
-def _get_all_sources(wildcards):
-    return [f"data/{source}.ndjson" for source in config["sources"]]
-
-
-rule fetch_all_sequences:
-    input:
-        all_sources=_get_all_sources,
-    output:
-        sequences_ndjson="data/sequences.ndjson",
-    shell:
-        """
-        cat {input.all_sources} > {output.sequences_ndjson}
-        """
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index bed20547..643fbb5e 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -138,7 +138,7 @@ rule append_gene_coverage_columns:
     output:
         metadata_all="results/metadata_all.tsv",
     params:
-        id_field=config["curate"]["id_field"],
+        id_field=config["curate"]["output_id_field"],
     shell:
         """
         cp {input.metadata} {output.metadata_all}
diff --git a/ingest/rules/split_serotypes.smk b/ingest/rules/split_serotypes.smk
index 29b22547..f17d1977 100644
--- a/ingest/rules/split_serotypes.smk
+++ b/ingest/rules/split_serotypes.smk
@@ -1,5 +1,5 @@
 """
-This part of the workflow handles splitting the data by serotype either based on the 
+This part of the workflow handles splitting the data by serotype either based on the
 NCBI metadata or Nextclade dataset. Could use both if necessary to cross-validate.
 
     metadata = "data/metadata_all.tsv"
@@ -22,7 +22,7 @@ rule split_by_serotype_genbank:
     output:
         sequences = "results/sequences_{serotype}.fasta"
     params:
-        id_field = config["curate"]["id_field"],
+        id_field = config["curate"]["output_id_field"],
         serotype_field = config["curate"]["serotype_field"]
     shell:
         """

From a494822d5e6a2757472e8aab3d90c2171068884d Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 17 Dec 2024 14:35:41 -0800
Subject: [PATCH 02/10] Ingest: Infer URL column from genbank accession for
 node call out

---
 ingest/defaults/config.yaml |  3 +++
 ingest/rules/curate.smk     | 22 +++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index c31885d8..371f5b58 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -99,6 +99,8 @@ curate:
   output_id_field: "genbank_accession"
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
+  # The field in the NDJSON record that contains the actual GenBank accession
+  genbank_accession: "genbank_accession"
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     "strain",
@@ -118,6 +120,7 @@ curate:
     "abbr_authors",
     "institution",
     "serotype_genbank", # inferred from virus_tax_id
+    "url"
   ]
 
 nextclade:
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 2b21a39d..53a748b1 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -117,9 +117,29 @@ rule curate:
                 --output-seq-field {params.sequence_field} ) 2>> {log}
         """
 
-rule subset_metadata:
+rule add_metadata_columns:
+    """Add columns to metadata
+    Notable columns:
+    - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*').
+    """
     input:
         metadata="data/all_metadata_curated.tsv",
+    output:
+        metadata = temp("data/all_metadata_added.tsv")
+    params:
+        accession=config['curate']['genbank_accession']
+    shell:
+        """
+        csvtk mutate2 -t \
+          -n url \
+          -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \
+          {input.metadata} \
+        > {output.metadata}
+        """
+
+rule subset_metadata:
+    input:
+        metadata="data/all_metadata_added.tsv",
     output:
         subset_metadata="data/metadata_all.tsv",
     params:

From 57a8457aa807637335b8c5b753622dd59ea3318d Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 17 Dec 2024 14:51:55 -0800
Subject: [PATCH 03/10] Use authors and full_authors fields

Since auspice automatically detects "authors" and we prefer the abbreviated authors
list displayed, change "abbr_authors" to "authors" and "authors" to "full_authors".

This matches the pathogen-repo-guide
---
 ingest/defaults/config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 371f5b58..51e960ea 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -57,7 +57,7 @@ curate:
     host-name: host
     is-lab-host: is_lab_host
     isolate-lineage-source: sample_type
-    submitter-names: authors
+    submitter-names: full_authors
     submitter-affiliation: institution
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
@@ -83,11 +83,11 @@ curate:
       "los", "nad", "of", "op", "sur", "the", "y"
     ]
   # Metadata field that contains the list of authors associated with the sequence
-  authors_field: "authors"
+  authors_field: "full_authors"
   # Default value to use if the authors field is empty
   authors_default_value: "?"
   # Name to use for the generated abbreviated authors field
-  abbr_authors_field: "abbr_authors"
+  abbr_authors_field: "authors"
   # Path to the manual annotations file
   # The path should be relative to the ingest directory
   annotations: "defaults/annotations.tsv"
@@ -117,7 +117,7 @@ curate:
     "date_released",
     "date_updated",
     "authors",
-    "abbr_authors",
+    "full_authors",
     "institution",
     "serotype_genbank", # inferred from virus_tax_id
     "url"

From 1a168b5d3107e9fd5ce9b95fd298774c62ef06a6 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 17 Dec 2024 16:37:14 -0800
Subject: [PATCH 04/10] Ingest: change `genbank_accession` to `accession`

To match the pathogen repo guide, change:

* `genbank_accession` to `accession`
* `genbank_accession_rev` to `accession_version`

There should be a subsequent change in the phylogenetic workflow
---
 ingest/defaults/config.yaml                    | 18 +++++++++---------
 ingest/rules/fetch_from_ncbi.smk               |  6 +++---
 ingest/rules/nextclade.smk                     |  3 +++
 ...ene-converage-from-nextclade-translation.py | 14 ++++++++++----
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 51e960ea..f5c16b2b 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -42,8 +42,8 @@ curate:
   # The original field names should match the ncbi_datasets_fields provided above.
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    accession: genbank_accession
-    accession-rev: genbank_accession_rev
+    accession: accession
+    accession_version: accession_version
     isolate-lineage: strain
     sourcedb: database
     geo-region: region
@@ -63,7 +63,7 @@ curate:
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: "^.+$"
   # Back up strain name field to use if "strain" doesn"t match regex above
-  strain_backup_fields: ["genbank_accession"]
+  strain_backup_fields: ["accession"]
   # List of date fields to standardize to ISO format YYYY-MM-DD
   date_fields: ["date", "date_released", "date_updated"]
   # List of expected date formats that are present in the date fields provided above
@@ -94,18 +94,18 @@ curate:
   # Serotype field name inferred from NCBI Genbank annotation
   serotype_field: "serotype_genbank"
   # The ID field in the metadata to use to merge the manual annotations
-  annotations_id: "genbank_accession"
+  annotations_id: "accession"
   # The ID field in the metadata to use as the sequence id in the output FASTA file
-  output_id_field: "genbank_accession"
+  output_id_field: "accession"
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
   # The field in the NDJSON record that contains the actual GenBank accession
-  genbank_accession: "genbank_accession"
+  genbank_accession: "accession"
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     "strain",
-    "genbank_accession",
-    "genbank_accession_rev",
+    "accession",
+    "accession_version",
     "date",
     "region",
     "country",
@@ -129,7 +129,7 @@ nextclade:
   gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"]
   # Nextclade Fields to rename to metadata field names.
   field_map:
-    seqName: genbank_accession # ID field used to merge annotations
+    seqName: accession # ID field used to merge annotations
     clade: genotype_nextclade
     alignmentStart: alignmentStart
     alignmentEnd: alignmentEnd
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index 55a66936..0932c2e2 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -77,8 +77,8 @@ rule format_ncbi_dataset_report:
             --elide-header \
             | csvtk fix-quotes -Ht \
             | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
-            | csvtk rename -t -f accession -n accession-rev \
-            | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | csvtk rename -t -f accession -n accession_version \
+            | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
             | csvtk del-quotes -t \
             | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
@@ -99,7 +99,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column accession-rev \
+            --seq-id-column accession_version \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 643fbb5e..06644d61 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -107,10 +107,13 @@ rule calculate_gene_coverage:
         gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv",
     wildcard_constraints:
         serotype=SEROTYPE_CONSTRAINTS,
+    params:
+        id_field=config["curate"]["output_id_field"],
     shell:
         """
         python scripts/calculate-gene-converage-from-nextclade-translation.py \
           --fasta {input.nextclade_translation} \
+          --out-id {params.id_field} \
           --out-col {wildcards.gene}_coverage \
           > {output.gene_coverage}
         """
diff --git a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py
index 4ab7c199..c6bdcb5d 100644
--- a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py
+++ b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py
@@ -17,6 +17,12 @@ def parse_args():
         type=str,
         help="FASTA file of CDS translations from Nextclade.",
     )
+    parser.add_argument(
+        "--out-id",
+        type=str,
+        default="accession",
+        help="Output record ID.",
+    )
     parser.add_argument(
         "--out-col",
         type=str,
@@ -26,16 +32,16 @@ def parse_args():
     return parser.parse_args()
 
 
-def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
+def calculate_gene_coverage_from_nextclade_cds(fasta, out_id, out_col):
     """
     Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade.
     """
-    print(f"genbank_accession\t{out_col}")
+    print(f"{out_id}\t{out_col}")
     # Iterate over the sequences in the FASTA file
     for record in SeqIO.parse(fasta, "fasta"):
         sequence_id = record.id
         sequence = str(record.seq)
-        
+
         # Calculate gene coverage
         results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])",  sequence.upper())
         gene_coverage = round(len(results) / len(sequence), 3)
@@ -47,7 +53,7 @@ def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
 def main():
     args = parse_args()
 
-    calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col)
+    calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_id, args.out_col)
 
 
 if __name__ == "__main__":

From cca844c330aa9157bb3f34fc80cffab8935b7653 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 17 Dec 2024 16:49:09 -0800
Subject: [PATCH 05/10] Rename phylogenetic/config to phylogenetic/defaults

To be more consistent with the pathogen-repo-guide and the ingest workflow
---
 phylogenetic/Snakefile                           |  2 +-
 .../{config => defaults}/clades_genotypes.tsv    |  0
 .../{config => defaults}/clades_serotypes.tsv    |  0
 .../{config => defaults}/color_orderings.tsv     |  0
 .../{config => defaults}/color_schemes.tsv       |  0
 .../{config => defaults}/config_dengue.yaml      | 16 ++++++++--------
 phylogenetic/{config => defaults}/description.md |  0
 phylogenetic/{config => defaults}/exclude.txt    |  0
 .../{config => defaults}/include_all.txt         |  0
 .../{config => defaults}/include_denv1.txt       |  0
 .../{config => defaults}/include_denv2.txt       |  0
 .../{config => defaults}/include_denv3.txt       |  0
 .../{config => defaults}/include_denv4.txt       |  0
 .../{config => defaults}/reference_all_genome.gb |  0
 .../reference_denv1_genome.gb                    |  0
 .../reference_denv2_genome.gb                    |  0
 .../reference_denv3_genome.gb                    |  0
 .../reference_denv4_genome.gb                    |  0
 phylogenetic/rules/annotate_phylogeny.smk        |  2 +-
 phylogenetic/rules/export.smk                    |  8 ++++----
 phylogenetic/rules/prepare_sequences.smk         |  2 +-
 phylogenetic/rules/prepare_sequences_E.smk       |  8 ++++----
 22 files changed, 19 insertions(+), 19 deletions(-)
 rename phylogenetic/{config => defaults}/clades_genotypes.tsv (100%)
 rename phylogenetic/{config => defaults}/clades_serotypes.tsv (100%)
 rename phylogenetic/{config => defaults}/color_orderings.tsv (100%)
 rename phylogenetic/{config => defaults}/color_schemes.tsv (100%)
 rename phylogenetic/{config => defaults}/config_dengue.yaml (75%)
 rename phylogenetic/{config => defaults}/description.md (100%)
 rename phylogenetic/{config => defaults}/exclude.txt (100%)
 rename phylogenetic/{config => defaults}/include_all.txt (100%)
 rename phylogenetic/{config => defaults}/include_denv1.txt (100%)
 rename phylogenetic/{config => defaults}/include_denv2.txt (100%)
 rename phylogenetic/{config => defaults}/include_denv3.txt (100%)
 rename phylogenetic/{config => defaults}/include_denv4.txt (100%)
 rename phylogenetic/{config => defaults}/reference_all_genome.gb (100%)
 rename phylogenetic/{config => defaults}/reference_denv1_genome.gb (100%)
 rename phylogenetic/{config => defaults}/reference_denv2_genome.gb (100%)
 rename phylogenetic/{config => defaults}/reference_denv3_genome.gb (100%)
 rename phylogenetic/{config => defaults}/reference_denv4_genome.gb (100%)

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 46457340..c1e926f1 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -1,4 +1,4 @@
-configfile: "config/config_dengue.yaml"
+configfile: "defaults/config_dengue.yaml"
 
 serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']
 genes = ['genome', 'E']
diff --git a/phylogenetic/config/clades_genotypes.tsv b/phylogenetic/defaults/clades_genotypes.tsv
similarity index 100%
rename from phylogenetic/config/clades_genotypes.tsv
rename to phylogenetic/defaults/clades_genotypes.tsv
diff --git a/phylogenetic/config/clades_serotypes.tsv b/phylogenetic/defaults/clades_serotypes.tsv
similarity index 100%
rename from phylogenetic/config/clades_serotypes.tsv
rename to phylogenetic/defaults/clades_serotypes.tsv
diff --git a/phylogenetic/config/color_orderings.tsv b/phylogenetic/defaults/color_orderings.tsv
similarity index 100%
rename from phylogenetic/config/color_orderings.tsv
rename to phylogenetic/defaults/color_orderings.tsv
diff --git a/phylogenetic/config/color_schemes.tsv b/phylogenetic/defaults/color_schemes.tsv
similarity index 100%
rename from phylogenetic/config/color_schemes.tsv
rename to phylogenetic/defaults/color_schemes.tsv
diff --git a/phylogenetic/config/config_dengue.yaml b/phylogenetic/defaults/config_dengue.yaml
similarity index 75%
rename from phylogenetic/config/config_dengue.yaml
rename to phylogenetic/defaults/config_dengue.yaml
index fbcd7263..b861686e 100644
--- a/phylogenetic/config/config_dengue.yaml
+++ b/phylogenetic/defaults/config_dengue.yaml
@@ -8,8 +8,8 @@ strain_id_field: "genbank_accession"
 display_strain_field: "strain"
 
 filter:
-  exclude: "config/exclude.txt"
-  include: "config/include_{serotype}.txt"
+  exclude: "defaults/exclude.txt"
+  include: "defaults/include_{serotype}.txt"
   group_by: "year region"
   min_length:
     genome: 5000
@@ -32,11 +32,11 @@ traits:
 
 clades:
   clade_definitions:
-    all: 'config/clades_serotypes.tsv'
-    denv1: 'config/clades_genotypes.tsv'
-    denv2: 'config/clades_genotypes.tsv'
-    denv3: 'config/clades_genotypes.tsv'
-    denv4: 'config/clades_genotypes.tsv'
+    all: 'defaults/clades_serotypes.tsv'
+    denv1: 'defaults/clades_genotypes.tsv'
+    denv2: 'defaults/clades_genotypes.tsv'
+    denv3: 'defaults/clades_genotypes.tsv'
+    denv4: 'defaults/clades_genotypes.tsv'
 
 export:
-  description: "config/description.md"
+  description: "defaults/description.md"
diff --git a/phylogenetic/config/description.md b/phylogenetic/defaults/description.md
similarity index 100%
rename from phylogenetic/config/description.md
rename to phylogenetic/defaults/description.md
diff --git a/phylogenetic/config/exclude.txt b/phylogenetic/defaults/exclude.txt
similarity index 100%
rename from phylogenetic/config/exclude.txt
rename to phylogenetic/defaults/exclude.txt
diff --git a/phylogenetic/config/include_all.txt b/phylogenetic/defaults/include_all.txt
similarity index 100%
rename from phylogenetic/config/include_all.txt
rename to phylogenetic/defaults/include_all.txt
diff --git a/phylogenetic/config/include_denv1.txt b/phylogenetic/defaults/include_denv1.txt
similarity index 100%
rename from phylogenetic/config/include_denv1.txt
rename to phylogenetic/defaults/include_denv1.txt
diff --git a/phylogenetic/config/include_denv2.txt b/phylogenetic/defaults/include_denv2.txt
similarity index 100%
rename from phylogenetic/config/include_denv2.txt
rename to phylogenetic/defaults/include_denv2.txt
diff --git a/phylogenetic/config/include_denv3.txt b/phylogenetic/defaults/include_denv3.txt
similarity index 100%
rename from phylogenetic/config/include_denv3.txt
rename to phylogenetic/defaults/include_denv3.txt
diff --git a/phylogenetic/config/include_denv4.txt b/phylogenetic/defaults/include_denv4.txt
similarity index 100%
rename from phylogenetic/config/include_denv4.txt
rename to phylogenetic/defaults/include_denv4.txt
diff --git a/phylogenetic/config/reference_all_genome.gb b/phylogenetic/defaults/reference_all_genome.gb
similarity index 100%
rename from phylogenetic/config/reference_all_genome.gb
rename to phylogenetic/defaults/reference_all_genome.gb
diff --git a/phylogenetic/config/reference_denv1_genome.gb b/phylogenetic/defaults/reference_denv1_genome.gb
similarity index 100%
rename from phylogenetic/config/reference_denv1_genome.gb
rename to phylogenetic/defaults/reference_denv1_genome.gb
diff --git a/phylogenetic/config/reference_denv2_genome.gb b/phylogenetic/defaults/reference_denv2_genome.gb
similarity index 100%
rename from phylogenetic/config/reference_denv2_genome.gb
rename to phylogenetic/defaults/reference_denv2_genome.gb
diff --git a/phylogenetic/config/reference_denv3_genome.gb b/phylogenetic/defaults/reference_denv3_genome.gb
similarity index 100%
rename from phylogenetic/config/reference_denv3_genome.gb
rename to phylogenetic/defaults/reference_denv3_genome.gb
diff --git a/phylogenetic/config/reference_denv4_genome.gb b/phylogenetic/defaults/reference_denv4_genome.gb
similarity index 100%
rename from phylogenetic/config/reference_denv4_genome.gb
rename to phylogenetic/defaults/reference_denv4_genome.gb
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index 4e3338e4..f62d8d70 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -44,7 +44,7 @@ rule translate:
     input:
         tree = "results/{gene}/tree_{serotype}.nwk",
         node_data = "results/{gene}/nt-muts_{serotype}.json",
-        reference = lambda wildcard: "config/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/config/reference_{serotype}_{gene}.gb"
+        reference = lambda wildcard: "defaults/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb"
     output:
         node_data = "results/{gene}/aa-muts_{serotype}.json"
     shell:
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index c610eec3..92211388 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -22,8 +22,8 @@ import json
 
 rule colors:
     input:
-        color_schemes = "config/color_schemes.tsv",
-        color_orderings = "config/color_orderings.tsv",
+        color_schemes = "defaults/color_schemes.tsv",
+        color_orderings = "defaults/color_orderings.tsv",
         metadata = "data/metadata_{serotype}.tsv",
     output:
         colors = "results/colors_{serotype}.tsv"
@@ -40,7 +40,7 @@ rule colors:
 rule prepare_auspice_config:
     """Prepare the auspice config file for each serotypes"""
     output:
-        auspice_config="results/config/{gene}/auspice_config_{serotype}.json",
+        auspice_config="results/defaults/{gene}/auspice_config_{serotype}.json",
     params:
         replace_clade_key=lambda wildcard: r"clade_membership" if wildcard.gene in ['genome'] else r"genotype_nextclade",
         replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"Dengue Genotype (Nextclade)",
@@ -140,7 +140,7 @@ rule export:
         nt_muts = "results/{gene}/nt-muts_{serotype}.json",
         aa_muts = "results/{gene}/aa-muts_{serotype}.json",
         description = config["export"]["description"],
-        auspice_config = "results/config/{gene}/auspice_config_{serotype}.json",
+        auspice_config = "results/defaults/{gene}/auspice_config_{serotype}.json",
         colors = "results/colors_{serotype}.tsv",
     output:
         auspice_json = "results/{gene}/raw_dengue_{serotype}.json"
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index 60144f85..ec4157ac 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -85,7 +85,7 @@ rule align:
     """
     input:
         sequences = "results/{gene}/filtered_{serotype}.fasta",
-        reference = lambda wildcard: "config/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/config/reference_{serotype}_{gene}.gb"
+        reference = lambda wildcard: "defaults/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb"
     output:
         alignment = "results/{gene}/aligned_{serotype}.fasta"
     shell:
diff --git a/phylogenetic/rules/prepare_sequences_E.smk b/phylogenetic/rules/prepare_sequences_E.smk
index 877fa304..c3c30030 100644
--- a/phylogenetic/rules/prepare_sequences_E.smk
+++ b/phylogenetic/rules/prepare_sequences_E.smk
@@ -21,10 +21,10 @@ rule generate_E_reference_files:
     Generating reference files for the E gene
     """
     input:
-        reference = "config/reference_{serotype}_genome.gb",
+        reference = "defaults/reference_{serotype}_genome.gb",
     output:
-        fasta = "results/config/reference_{serotype}_E.fasta",
-        genbank = "results/config/reference_{serotype}_E.gb",
+        fasta = "results/defaults/reference_{serotype}_E.fasta",
+        genbank = "results/defaults/reference_{serotype}_E.gb",
     params:
         gene = "E",
     shell:
@@ -42,7 +42,7 @@ rule align_and_extract_E:
     """
     input:
         sequences = "data/sequences_{serotype}.fasta",
-        reference = "results/config/reference_{serotype}_E.fasta"
+        reference = "results/defaults/reference_{serotype}_E.fasta"
     output:
         sequences = "results/E/sequences_{serotype}.fasta"
     params:

From a56685a2371886d467ffbd238f0e957afb2e6d60 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 17 Dec 2024 16:55:57 -0800
Subject: [PATCH 06/10] Phylogenetic: Change `genbank_accession` to `accession`

Use the combination of `accession` and `url` fields in the phylogenetic build.
This change follows a similar change to the ingest workflow.
---
 phylogenetic/defaults/config_dengue.yaml     | 2 +-
 phylogenetic/example_data/metadata_all.tsv   | 2 +-
 phylogenetic/example_data/metadata_denv1.tsv | 2 +-
 phylogenetic/example_data/metadata_denv2.tsv | 2 +-
 phylogenetic/example_data/metadata_denv3.tsv | 2 +-
 phylogenetic/example_data/metadata_denv4.tsv | 2 +-
 phylogenetic/rules/export.smk                | 3 ++-
 7 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/phylogenetic/defaults/config_dengue.yaml b/phylogenetic/defaults/config_dengue.yaml
index b861686e..1e5854e6 100644
--- a/phylogenetic/defaults/config_dengue.yaml
+++ b/phylogenetic/defaults/config_dengue.yaml
@@ -4,7 +4,7 @@
 sequences_url: "https://data.nextstrain.org/files/workflows/dengue/sequences_{serotype}.fasta.zst"
 metadata_url: "https://data.nextstrain.org/files/workflows/dengue/metadata_{serotype}.tsv.zst"
 
-strain_id_field: "genbank_accession"
+strain_id_field: "accession"
 display_strain_field: "strain"
 
 filter:
diff --git a/phylogenetic/example_data/metadata_all.tsv b/phylogenetic/example_data/metadata_all.tsv
index 01a411da..ba8c0c35 100644
--- a/phylogenetic/example_data/metadata_all.tsv
+++ b/phylogenetic/example_data/metadata_all.tsv
@@ -1,4 +1,4 @@
-strain	virus	genbank_accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
+strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
 DENV1/VIETNAM/BIDV1862/2008	dengue	FJ410220	2008-XX-XX	Southeast Asia	Vietnam	Vietnam	Vietnam	?	genome	Alvarado et al.	?	?	?	?
 DENV3/WALLIS_AND_FUTUNA/1404838/1989	dengue	JQ920487	1989-08-14	?	Wallis And Futuna	French Overseas Collectivity	Wallis And Futuna	?	genome	Aubry et al.	?	?	?	?
 DENV/BRAZIL/SJRP778/2013	dengue	KT438585	2013-02-04	South America	Brazil	Brazil	Brazil	?	genome	Araujo et al.	?	?	?	?
diff --git a/phylogenetic/example_data/metadata_denv1.tsv b/phylogenetic/example_data/metadata_denv1.tsv
index 0574a0c7..b697eb54 100644
--- a/phylogenetic/example_data/metadata_denv1.tsv
+++ b/phylogenetic/example_data/metadata_denv1.tsv
@@ -1,4 +1,4 @@
-strain	virus	genbank_accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
+strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
 DENV1/INDIA/237/1962	dengue	JF297572	1962-XX-XX	South Asia	India	India	India	?	genome	Cecilia et al.	?	?	?	?
 DENV1/CHINA/RL6/2013	dengue	KJ470727	2013-09-XX	China	China	China	China	?	genome	Guo et al.	?	?	?	?
 DENV1/THAILAND/AILANDKPPKDV08491NC04528V0L/2007	dengue	JQ993108	2007-XX-XX	Southeast Asia	Thailand	Thailand	Thailand	?	genome	Aldstadt et al.	?	?	?	?
diff --git a/phylogenetic/example_data/metadata_denv2.tsv b/phylogenetic/example_data/metadata_denv2.tsv
index 5c694c00..31f88305 100644
--- a/phylogenetic/example_data/metadata_denv2.tsv
+++ b/phylogenetic/example_data/metadata_denv2.tsv
@@ -1,4 +1,4 @@
-strain	virus	genbank_accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
+strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
 DENV2/INDONESIA/MKS0084/2007	dengue	KC762660	2007-11-21	Southeast Asia	Indonesia	Makassar	Makassar	?	genome	Bifani et al.	?	?	?	?
 DENV2/TAIWAN/71/2002	dengue	EF016253	2002-XX-XX	China	Taiwan	Taiwan	Taiwan	?	genome	Ke et al.	?	?	?	?
 DENV2/PAKISTAN/8D/2011	dengue	JX042512	2011-XX-XX	West Asia	Pakistan	Pakistan	Pakistan	?	genome	Alvi et al.	?	?	?	?
diff --git a/phylogenetic/example_data/metadata_denv3.tsv b/phylogenetic/example_data/metadata_denv3.tsv
index 18c5b0aa..911e8cef 100644
--- a/phylogenetic/example_data/metadata_denv3.tsv
+++ b/phylogenetic/example_data/metadata_denv3.tsv
@@ -1,4 +1,4 @@
-strain	virus	genbank_accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
+strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
 DENV3/PUERTO_RICO/BIDV1615/2004	dengue	FJ373302	2004-XX-XX	North America	Puerto Rico	Puerto Rico	Puerto Rico	?	genome	Alvarado et al.	?	?	?	?
 DENV3/THAILAND/BIDV2323/2001	dengue	FJ744734	2001-XX-XX	Southeast Asia	Thailand	Kamphaeng Phet	Kamphaeng Phet	?	genome	Alvarado et al.	?	?	?	?
 DENV3/BANGLADESH/JACOB/NA	dengue	AY656673	?	South Asia	Bangladesh	Bangladesh	Bangladesh	?	genome	Aaskov et al.	?	?	?	?
diff --git a/phylogenetic/example_data/metadata_denv4.tsv b/phylogenetic/example_data/metadata_denv4.tsv
index 9d9ce386..d3175b21 100644
--- a/phylogenetic/example_data/metadata_denv4.tsv
+++ b/phylogenetic/example_data/metadata_denv4.tsv
@@ -1,4 +1,4 @@
-strain	virus	genbank_accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
+strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
 DENV4/ECUADOR/FSE2098/2006	dengue	GQ139572	2006-XX-XX	South America	Ecuador	Ecuador	Ecuador	?	genome	Alava et al.	?	?	?	?
 DENV4/BRAZIL/SJRP580/2012	dengue	KP188562	2012-12-10	South America	Brazil	Brazil	Brazil	?	genome	Araujo et al.	?	?	?	?
 DENV4/BRAZIL/GUSP0670/2013	dengue	KP704043	2013-03-17	South America	Brazil	Brazil	Brazil	?	genome	Balarini et al.	?	?	?	?
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index 92211388..3b50f37f 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -103,7 +103,8 @@ rule prepare_auspice_config:
               "author"
             ],
             "metadata_columns": [
-              "genbank_accession"
+              "accession",
+              "url"
             ]
           }
 

From 4acba653960ff03c751f7ff9d3a718010d3e91ec Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 18 Dec 2024 14:04:04 -0800
Subject: [PATCH 07/10] Automatically drop is_lab_host=true records during
 phylogenetic workflow

---
 phylogenetic/rules/prepare_sequences.smk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index ec4157ac..44f42249 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -75,7 +75,8 @@ rule filter:
             --group-by {params.group_by} \
             --sequences-per-group {params.sequences_per_group} \
             --min-length {params.min_length} \
-            --exclude-where country=? region=? date=? \
+            --exclude-where country=? region=? date=? is_lab_host='true' \
+            --query-columns is_lab_host:str
         """
 
 rule align:

From 65520e5d097022acfee88ed5fde9d0eef0cdf3d6 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 18 Dec 2024 15:52:41 -0800
Subject: [PATCH 08/10] Add frequencies panels

---
 phylogenetic/Snakefile                   |  1 +
 phylogenetic/defaults/config_dengue.yaml |  6 ++++
 phylogenetic/rules/export.smk            | 35 ++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index c1e926f1..633fe057 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -10,6 +10,7 @@ wildcard_constraints:
 rule all:
     input:
         auspice_json = expand("auspice/dengue_{serotype}_{gene}.json", serotype=serotypes, gene=genes),
+        tip_frequencies_json= expand("auspice/dengue_{serotype}_{gene}_tip-frequencies.json", serotype=serotypes, gene=genes)
 
 include: "rules/prepare_sequences.smk"
 include: "rules/prepare_sequences_E.smk"
diff --git a/phylogenetic/defaults/config_dengue.yaml b/phylogenetic/defaults/config_dengue.yaml
index 1e5854e6..320bdedf 100644
--- a/phylogenetic/defaults/config_dengue.yaml
+++ b/phylogenetic/defaults/config_dengue.yaml
@@ -38,5 +38,11 @@ clades:
     denv3: 'defaults/clades_genotypes.tsv'
     denv4: 'defaults/clades_genotypes.tsv'
 
+tip_frequencies:
+    min_date: "1980-01-01"
+    max_date: "6M"
+    narrow_bandwidth: 0.2
+    wide_bandwidth: 0.6
+
 export:
   description: "defaults/description.md"
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index 3b50f37f..aa49e686 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -102,6 +102,12 @@ rule prepare_auspice_config:
               "region",
               "author"
             ],
+            "panels": [
+              "tree",
+              "map",
+              "entropy",
+              "frequencies"
+            ],
             "metadata_columns": [
               "accession",
               "url"
@@ -179,3 +185,32 @@ rule final_strain_name:
             --display-strain-name {params.display_strain_field} \
             --output {output.auspice_json}
         """
+
+rule tip_frequencies:
+    """
+    Estimating KDE frequencies for tips
+    """
+    input:
+        tree = "results/{gene}/tree_{serotype}.nwk",
+        metadata = "data/metadata_{serotype}.tsv",
+    output:
+        tip_freq = "auspice/dengue_{serotype}_{gene}_tip-frequencies.json"
+    params:
+        strain_id = config["strain_id_field"],
+        min_date = config["tip_frequencies"]["min_date"],
+        max_date = config["tip_frequencies"]["max_date"],
+        narrow_bandwidth = config["tip_frequencies"]["narrow_bandwidth"],
+        wide_bandwidth = config["tip_frequencies"]["wide_bandwidth"]
+    shell:
+        r"""
+        augur frequencies \
+            --method kde \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --min-date {params.min_date} \
+            --max-date {params.max_date} \
+            --narrow-bandwidth {params.narrow_bandwidth} \
+            --wide-bandwidth {params.wide_bandwidth} \
+            --output {output.tip_freq}
+        """

From 1b6424bd57fa62376b08ccad0979405b78cad374 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 31 Dec 2024 13:10:23 -0800
Subject: [PATCH 09/10] Tip frequency strain ID must match the displayed strain
 ID in the final auspice JSON file

@j23414 initially believed that the Augur frequency function could utilize the 'accession' strain IDs to match the tree.json file. However, @joverlee521 pointed out that the frequencies JSON actually requires matching the final 'strain' IDs as found in the Auspice JSON file. Otherwise Auspice cannot link the strains to their frequencies.

Co-authored-by: Jover Lee <joverlee521@gmail.com>
---
 phylogenetic/rules/export.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index aa49e686..c344b50a 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -196,7 +196,7 @@ rule tip_frequencies:
     output:
         tip_freq = "auspice/dengue_{serotype}_{gene}_tip-frequencies.json"
     params:
-        strain_id = config["strain_id_field"],
+        strain_id = config["display_strain_field"],
         min_date = config["tip_frequencies"]["min_date"],
         max_date = config["tip_frequencies"]["max_date"],
         narrow_bandwidth = config["tip_frequencies"]["narrow_bandwidth"],

From b8b2fa34accf8188f6c5bad49000c6bd17e41c48 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 31 Dec 2024 15:10:32 -0800
Subject: [PATCH 10/10] Drop set_final_strain_name.py

Remove set_final_strain_name.py so that both the main JSON and the
frequencies JSON use the accession as the strain id. This simplifies the
augur frequencies call and the "accession" vs "strain" confusion.
---
 phylogenetic/rules/export.smk                 | 27 +++----------
 phylogenetic/scripts/set_final_strain_name.py | 38 -------------------
 2 files changed, 5 insertions(+), 60 deletions(-)
 delete mode 100755 phylogenetic/scripts/set_final_strain_name.py

diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index c344b50a..cf1a1fd7 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -95,7 +95,8 @@ rule prepare_auspice_config:
             ],
             "display_defaults": {
               "map_triplicate": True,
-              "color_by": params.replace_clade_key
+              "color_by": params.replace_clade_key,
+              "tip_label": "strain"
             },
             "filters": [
               "country",
@@ -110,6 +111,7 @@ rule prepare_auspice_config:
             ],
             "metadata_columns": [
               "accession",
+              "strain",
               "url"
             ]
           }
@@ -150,7 +152,7 @@ rule export:
         auspice_config = "results/defaults/{gene}/auspice_config_{serotype}.json",
         colors = "results/colors_{serotype}.tsv",
     output:
-        auspice_json = "results/{gene}/raw_dengue_{serotype}.json"
+        auspice_json = "auspice/dengue_{serotype}_{gene}.json"
     params:
         strain_id = config.get("strain_id_field", "strain"),
     shell:
@@ -167,25 +169,6 @@ rule export:
             --output {output.auspice_json}
         """
 
-rule final_strain_name:
-    input:
-        auspice_json="results/{gene}/raw_dengue_{serotype}.json",
-        metadata="data/metadata_{serotype}.tsv",
-    output:
-        auspice_json="auspice/dengue_{serotype}_{gene}.json"
-    params:
-        strain_id=config.get("strain_id_field", "strain"),
-        display_strain_field=config.get("display_strain_field", "strain"),
-    shell:
-        """
-        python3 scripts/set_final_strain_name.py \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id} \
-            --input-auspice-json {input.auspice_json} \
-            --display-strain-name {params.display_strain_field} \
-            --output {output.auspice_json}
-        """
-
 rule tip_frequencies:
     """
     Estimating KDE frequencies for tips
@@ -196,7 +179,7 @@ rule tip_frequencies:
     output:
         tip_freq = "auspice/dengue_{serotype}_{gene}_tip-frequencies.json"
     params:
-        strain_id = config["display_strain_field"],
+        strain_id = config["strain_id_field"],
         min_date = config["tip_frequencies"]["min_date"],
         max_date = config["tip_frequencies"]["max_date"],
         narrow_bandwidth = config["tip_frequencies"]["narrow_bandwidth"],
diff --git a/phylogenetic/scripts/set_final_strain_name.py b/phylogenetic/scripts/set_final_strain_name.py
deleted file mode 100755
index 08ca9359..00000000
--- a/phylogenetic/scripts/set_final_strain_name.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import pandas as pd
-import json, argparse
-from augur.io import read_metadata
-
-def replace_name_recursive(node, lookup):
-    if node["name"] in lookup:
-        node["name"] = lookup[node["name"]]
-
-    if "children" in node:
-        for child in node["children"]:
-            replace_name_recursive(child, lookup)
-
-if __name__=="__main__":
-    parser = argparse.ArgumentParser(
-        description="Swaps out the strain names in the Auspice JSON with the final strain name",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
-    parser.add_argument('--metadata', type=str, required=True, help="input data")
-    parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
-    parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
-    parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
-    args = parser.parse_args()
-
-    metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
-    name_lookup = {}
-    for ri, row in metadata.iterrows():
-        strain_id = row.name
-        name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]
-
-    with open(args.input_auspice_json, 'r') as fh:
-        data = json.load(fh)
-
-    replace_name_recursive(data['tree'], name_lookup)
-
-    with open(args.output, 'w') as fh:
-        json.dump(data, fh)