From 7d579149ec57465b1ccb2a4215163bff5afce77c Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 3 Dec 2024 16:00:44 -0800 Subject: [PATCH 01/10] Ingest: Re-align to pathogen-repo-guide This commit updates the `ingest` directory to match the latest version of the: https://github.com/nextstrain/pathogen-repo-guide/tree/b4001735fa55448f1426ff90cca93c667ca121ce/ingest --- .../nextstrain-automation/config.yaml | 3 +- ingest/defaults/config.yaml | 142 ++++++++++-------- ingest/rules/curate.smk | 22 ++- ingest/rules/fetch_from_ncbi.smk | 69 +++++---- ingest/rules/nextclade.smk | 2 +- ingest/rules/split_serotypes.smk | 4 +- 6 files changed, 132 insertions(+), 110 deletions(-) diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml index 853cafb3..ce21105f 100644 --- a/ingest/build-configs/nextstrain-automation/config.yaml +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -15,8 +15,7 @@ s3_dst: "s3://nextstrain-data/files/workflows/dengue" # Mapping of files to upload files_to_upload: - genbank.ndjson.xz: data/genbank.ndjson - all_sequences.ndjson.xz: data/sequences.ndjson + ncbi.ndjson.zst: data/ncbi.ndjson metadata_all.tsv.zst: results/metadata_all.tsv sequences_all.fasta.zst: results/sequences_all.fasta metadata_denv1.tsv.zst: results/metadata_denv1.tsv diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index bc6d54d2..c31885d8 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -1,9 +1,14 @@ -# Sources of sequences to include in the ingest run -sources: ['genbank'] -# Pathogen NCBI Taxonomy ID -ncbi_taxon_id: '12637' +# This configuration file should contain all required configuration parameters +# for the ingest workflow to run to completion. +# +# Define optional config parameters with their default values here so that users +# do not have to dig through the workflows to figure out the default values + +# Required to fetch from NCBI Datasets +ncbi_taxon_id: "12637" + # The list of NCBI Datasets fields to include from NCBI Datasets output -# These need to be the mneumonics of the NCBI Datasets fields, see docs for full list of fields +# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields # https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields # Note: the "accession" field MUST be provided to match with the sequences ncbi_datasets_fields: @@ -19,15 +24,23 @@ ncbi_datasets_fields: - virus-name - length - host-name + - is-lab-host - isolate-lineage-source - submitter-names - submitter-affiliation -# Params for the curate rule +# Config parameters related to the curate pipeline curate: - # NCBI Fields to rename to Nextstrain field names. - # This is the first step in the pipeline, so any references to field names - # in the configs below should use the new field names + # URL pointed to public generalized geolocation rules + # For the Nextstrain team, this is currently + # "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" + geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" + # The path to the local geolocation rules within the pathogen repo + # The path should be relative to the ingest directory. + local_geolocation_rules: "defaults/geolocation-rules.tsv" + # List of field names to change where the key is the original field name and the value is the new field name + # The original field names should match the ncbi_datasets_fields provided above. + # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: accession: genbank_accession accession-rev: genbank_accession_rev @@ -35,83 +48,82 @@ curate: sourcedb: database geo-region: region geo-location: location - host-name: host isolate-collection-date: date - release-date: release_date - update-date: update_date + release-date: date_released + update-date: date_updated virus-tax-id: virus_tax_id virus-name: virus_name + length: length + host-name: host + is-lab-host: is_lab_host + isolate-lineage-source: sample_type submitter-names: authors submitter-affiliation: institution # Standardized strain name regex - # Currently accepts any characters because we do not have a clear standard for strain names - strain_regex: '^.+$' - # Back up strain name field if 'strain' doesn't match regex above - strain_backup_fields: ['genbank_accession'] - # List of date fields to standardize - date_fields: ['date', 'release_date', 'update_date'] - # Expected date formats present in date fields + # Currently accepts any characters because we do not have a clear standard for strain names across pathogens + strain_regex: "^.+$" + # Back up strain name field to use if "strain" doesn"t match regex above + strain_backup_fields: ["genbank_accession"] + # List of date fields to standardize to ISO format YYYY-MM-DD + date_fields: ["date", "date_released", "date_updated"] + # List of expected date formats that are present in the date fields provided above # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes - expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ'] + expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"] # The expected field that contains the GenBank geo_loc_name genbank_location_field: location - # Titlecase rules titlecase: - # Abbreviations not cast to titlecase, keeps uppercase - abbreviations: ['USA'] + # List of string fields to titlecase + fields: ["region", "country", "division", "location"] + # List of abbreviations not cast to titlecase, keeps uppercase + abbreviations: ["USA"] # Articles that should not be cast to titlecase articles: [ - 'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le', - 'los', 'nad', 'of', 'op', 'sur', 'the', 'y' + "and", "d", "de", "del", "des", "di", "do", "en", "l", "la", "las", "le", + "los", "nad", "of", "op", "sur", "the", "y" ] - # List of string fields to titlecase - fields: ['region', 'country', 'division', 'location'] - # Authors field name - authors_field: 'authors' - # Authors default value if authors value is empty - authors_default_value: '?' - # Field name for the generated abbreviated authors - abbr_authors_field: 'abbr_authors' - # General geolocation rules to apply to geolocation fields - geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv' - # Local geolocation rules that are only applicable to mpox data - # Local rules can overwrite the general geolocation rules provided above - local_geolocation_rules: 'defaults/geolocation-rules.tsv' - # User annotations file - annotations: 'defaults/annotations.tsv' + # Metadata field that contains the list of authors associated with the sequence + authors_field: "authors" + # Default value to use if the authors field is empty + authors_default_value: "?" + # Name to use for the generated abbreviated authors field + abbr_authors_field: "abbr_authors" + # Path to the manual annotations file + # The path should be relative to the ingest directory + annotations: "defaults/annotations.tsv" # Serotype field name inferred from NCBI Genbank annotation - serotype_field: 'serotype_genbank' - # ID field used to merge annotations - annotations_id: 'genbank_accession' - # Field to use as the sequence ID in the FASTA file - id_field: 'genbank_accession' - # Field to use as the sequence in the FASTA file - sequence_field: 'sequence' - # Final output columns for the metadata TSV + serotype_field: "serotype_genbank" + # The ID field in the metadata to use to merge the manual annotations + annotations_id: "genbank_accession" + # The ID field in the metadata to use as the sequence id in the output FASTA file + output_id_field: "genbank_accession" + # The field in the NDJSON record that contains the actual genomic sequence + output_sequence_field: "sequence" + # The list of metadata columns to keep in the final output of the curation pipeline. metadata_columns: [ - 'strain', - 'genbank_accession', - 'genbank_accession_rev', - 'date', - 'region', - 'country', - 'division', - 'location', - 'length', - 'host', - 'release_date', - 'update_date', - 'abbr_authors', - 'authors', - 'institution', - 'serotype_genbank', # inferred from virus_tax_id + "strain", + "genbank_accession", + "genbank_accession_rev", + "date", + "region", + "country", + "division", + "location", + "length", + "host", + "is_lab_host", + "date_released", + "date_updated", + "authors", + "abbr_authors", + "institution", + "serotype_genbank", # inferred from virus_tax_id ] nextclade: min_length: 1000 # E gene length is approximately 1400nt min_seed_cover: 0.1 - gene: ['E','C','M','pr','NS1','NS2A','NS2B','NS3','NS4A','2K','NS4B','NS5'] + gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"] # Nextclade Fields to rename to metadata field names. field_map: seqName: genbank_accession # ID field used to merge annotations diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 4f1475dc..2b21a39d 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -2,7 +2,7 @@ This part of the workflow handles curating the data into standardized formats and expects input file - sequences_ndjson = "data/sequences.ndjson" + ndjson = data/ncbi.ndjson This will produce output files as @@ -13,6 +13,10 @@ Parameters are expected to be defined in `config.curate`. """ +# The following two rules can be ignored if you choose not to use the +# generalized geolocation rules that are shared across pathogens. +# The Nextstrain team will try to maintain a generalized set of geolocation +# rules that can then be overridden by local geolocation rules per pathogen repo. rule fetch_general_geolocation_rules: output: general_geolocation_rules="data/general-geolocation-rules.tsv", @@ -42,10 +46,16 @@ def format_field_map(field_map: dict[str, str]) -> str: """ return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()]) - +# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data. +# You may want to add and/or remove steps from the pipeline for custom metadata +# curation for your pathogen. Note that the curate pipeline is streaming NDJSON +# records between scripts, so any custom scripts added to the pipeline should expect +# the input as NDJSON records from stdin and output NDJSON records to stdout. +# The final step of the pipeline should convert the NDJSON records to two +# separate files: a metadata TSV and a sequences FASTA. rule curate: input: - sequences_ndjson="data/sequences.ndjson", + sequences_ndjson="data/ncbi.ndjson", all_geolocation_rules="data/all-geolocation-rules.tsv", annotations=config["curate"]["annotations"], output: @@ -53,6 +63,8 @@ rule curate: sequences="results/sequences_all.fasta", log: "logs/curate.txt", + benchmark: + "benchmarks/curate.txt", params: field_map=format_field_map(config["curate"]["field_map"]), strain_regex=config["curate"]["strain_regex"], @@ -68,8 +80,8 @@ rule curate: abbr_authors_field=config["curate"]["abbr_authors_field"], annotations_id=config["curate"]["annotations_id"], serotype_field=config["curate"]["serotype_field"], - id_field=config["curate"]["id_field"], - sequence_field=config["curate"]["sequence_field"], + id_field=config["curate"]["output_id_field"], + sequence_field=config["curate"]["output_sequence_field"], shell: """ (cat {input.sequences_ndjson} \ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index efd9fbb7..55a66936 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -1,34 +1,51 @@ """ -This part of the workflow handles fetching sequences from various sources. -Uses `config.sources` to determine which sequences to include in final output. +This part of the workflow handles fetching sequences and metadata from NCBI. -Currently only fetches sequences from GenBank, but other sources can be -defined in the config. If adding other sources, add a new rule upstream -of rule `fetch_all_sequences` to create the file `data/{source}.ndjson` or the -file must exist as a static file in the repo. +REQUIRED INPUTS: -Produces final output as + None - sequences_ndjson = "data/sequences.ndjson" +OUTPUTS: -""" + ndjson = data/ncbi.ndjson +Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/) + - requires `ncbi_taxon_id` config + - Directly returns NDJSON without custom parsing + - Fastest option for large datasets (e.g. SARS-CoV-2) + - Only returns metadata fields that are available through NCBI Datasets + - Only works for viral genomes +""" rule fetch_ncbi_dataset_package: + params: + ncbi_taxon_id=config["ncbi_taxon_id"], output: dataset_package=temp("data/ncbi_dataset.zip"), - retries: 5 # Requires snakemake 7.7.0 or later + # Allow retries in case of network errors + retries: 5 benchmark: "benchmarks/fetch_ncbi_dataset_package.txt" - params: - ncbi_taxon_id=config["ncbi_taxon_id"], shell: """ - datasets download virus genome taxon {params.ncbi_taxon_id} \ + datasets download virus genome taxon {params.ncbi_taxon_id:q} \ --no-progressbar \ --filename {output.dataset_package} """ +# Note: This rule is not part of the default workflow! +# It is intended to be used as a specific target for users to be able +# to inspect and explore the full raw metadata from NCBI Datasets. +rule dump_ncbi_dataset_report: + input: + dataset_package="data/ncbi_dataset.zip", + output: + ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv", + shell: + """ + dataformat tsv virus-genome \ + --package {input.dataset_package} > {output.ncbi_dataset_tsv} + """ rule extract_ncbi_dataset_sequences: input: @@ -43,9 +60,7 @@ rule extract_ncbi_dataset_sequences: ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences} """ - rule format_ncbi_dataset_report: - # Formats the headers to match the NCBI mnemonic names input: dataset_package="data/ncbi_dataset.zip", output: @@ -60,22 +75,21 @@ rule format_ncbi_dataset_report: --package {input.dataset_package} \ --fields {params.ncbi_datasets_fields:q} \ --elide-header \ + | csvtk fix-quotes -Ht \ | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \ | csvtk rename -t -f accession -n accession-rev \ - | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \ + | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \ + | csvtk del-quotes -t \ | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv} """ - rule format_ncbi_datasets_ndjson: input: ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta", ncbi_dataset_tsv="data/ncbi_dataset_report.tsv", output: - ndjson="data/genbank.ndjson", - params: - ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), + ndjson="data/ncbi.ndjson", log: "logs/format_ncbi_datasets_ndjson.txt", benchmark: @@ -91,18 +105,3 @@ rule format_ncbi_datasets_ndjson: --duplicate-reporting warn \ 2> {log} > {output.ndjson} """ - - -def _get_all_sources(wildcards): - return [f"data/{source}.ndjson" for source in config["sources"]] - - -rule fetch_all_sequences: - input: - all_sources=_get_all_sources, - output: - sequences_ndjson="data/sequences.ndjson", - shell: - """ - cat {input.all_sources} > {output.sequences_ndjson} - """ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index bed20547..643fbb5e 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -138,7 +138,7 @@ rule append_gene_coverage_columns: output: metadata_all="results/metadata_all.tsv", params: - id_field=config["curate"]["id_field"], + id_field=config["curate"]["output_id_field"], shell: """ cp {input.metadata} {output.metadata_all} diff --git a/ingest/rules/split_serotypes.smk b/ingest/rules/split_serotypes.smk index 29b22547..f17d1977 100644 --- a/ingest/rules/split_serotypes.smk +++ b/ingest/rules/split_serotypes.smk @@ -1,5 +1,5 @@ """ -This part of the workflow handles splitting the data by serotype either based on the +This part of the workflow handles splitting the data by serotype either based on the NCBI metadata or Nextclade dataset. Could use both if necessary to cross-validate. metadata = "data/metadata_all.tsv" @@ -22,7 +22,7 @@ rule split_by_serotype_genbank: output: sequences = "results/sequences_{serotype}.fasta" params: - id_field = config["curate"]["id_field"], + id_field = config["curate"]["output_id_field"], serotype_field = config["curate"]["serotype_field"] shell: """ From a494822d5e6a2757472e8aab3d90c2171068884d Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 17 Dec 2024 14:35:41 -0800 Subject: [PATCH 02/10] Ingest: Infer URL column from genbank accession for node call out --- ingest/defaults/config.yaml | 3 +++ ingest/rules/curate.smk | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index c31885d8..371f5b58 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -99,6 +99,8 @@ curate: output_id_field: "genbank_accession" # The field in the NDJSON record that contains the actual genomic sequence output_sequence_field: "sequence" + # The field in the NDJSON record that contains the actual GenBank accession + genbank_accession: "genbank_accession" # The list of metadata columns to keep in the final output of the curation pipeline. metadata_columns: [ "strain", @@ -118,6 +120,7 @@ curate: "abbr_authors", "institution", "serotype_genbank", # inferred from virus_tax_id + "url" ] nextclade: diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 2b21a39d..53a748b1 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -117,9 +117,29 @@ rule curate: --output-seq-field {params.sequence_field} ) 2>> {log} """ -rule subset_metadata: +rule add_metadata_columns: + """Add columns to metadata + Notable columns: + - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). + """ input: metadata="data/all_metadata_curated.tsv", + output: + metadata = temp("data/all_metadata_added.tsv") + params: + accession=config['curate']['genbank_accession'] + shell: + """ + csvtk mutate2 -t \ + -n url \ + -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \ + {input.metadata} \ + > {output.metadata} + """ + +rule subset_metadata: + input: + metadata="data/all_metadata_added.tsv", output: subset_metadata="data/metadata_all.tsv", params: From 57a8457aa807637335b8c5b753622dd59ea3318d Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 17 Dec 2024 14:51:55 -0800 Subject: [PATCH 03/10] Use authors and full_authors fields Since auspice automatically detects "authors" and we prefer the abbreviated authors list displayed, change "abbr_authors" to "authors" and "authors" to "full_authors". This matches the pathogen-repo-guide --- ingest/defaults/config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 371f5b58..51e960ea 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -57,7 +57,7 @@ curate: host-name: host is-lab-host: is_lab_host isolate-lineage-source: sample_type - submitter-names: authors + submitter-names: full_authors submitter-affiliation: institution # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names across pathogens @@ -83,11 +83,11 @@ curate: "los", "nad", "of", "op", "sur", "the", "y" ] # Metadata field that contains the list of authors associated with the sequence - authors_field: "authors" + authors_field: "full_authors" # Default value to use if the authors field is empty authors_default_value: "?" # Name to use for the generated abbreviated authors field - abbr_authors_field: "abbr_authors" + abbr_authors_field: "authors" # Path to the manual annotations file # The path should be relative to the ingest directory annotations: "defaults/annotations.tsv" @@ -117,7 +117,7 @@ curate: "date_released", "date_updated", "authors", - "abbr_authors", + "full_authors", "institution", "serotype_genbank", # inferred from virus_tax_id "url" From 1a168b5d3107e9fd5ce9b95fd298774c62ef06a6 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 17 Dec 2024 16:37:14 -0800 Subject: [PATCH 04/10] Ingest: change `genbank_accession` to `accession` To match the pathogen repo guide, change: * `genbank_accession` to `accession` * `genbank_accession_rev` to `accession_version` There should be a subsequent change in the phylogenetic workflow --- ingest/defaults/config.yaml | 18 +++++++++--------- ingest/rules/fetch_from_ncbi.smk | 6 +++--- ingest/rules/nextclade.smk | 3 +++ ...ene-converage-from-nextclade-translation.py | 14 ++++++++++---- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 51e960ea..f5c16b2b 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -42,8 +42,8 @@ curate: # The original field names should match the ncbi_datasets_fields provided above. # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: - accession: genbank_accession - accession-rev: genbank_accession_rev + accession: accession + accession_version: accession_version isolate-lineage: strain sourcedb: database geo-region: region @@ -63,7 +63,7 @@ curate: # Currently accepts any characters because we do not have a clear standard for strain names across pathogens strain_regex: "^.+$" # Back up strain name field to use if "strain" doesn"t match regex above - strain_backup_fields: ["genbank_accession"] + strain_backup_fields: ["accession"] # List of date fields to standardize to ISO format YYYY-MM-DD date_fields: ["date", "date_released", "date_updated"] # List of expected date formats that are present in the date fields provided above @@ -94,18 +94,18 @@ curate: # Serotype field name inferred from NCBI Genbank annotation serotype_field: "serotype_genbank" # The ID field in the metadata to use to merge the manual annotations - annotations_id: "genbank_accession" + annotations_id: "accession" # The ID field in the metadata to use as the sequence id in the output FASTA file - output_id_field: "genbank_accession" + output_id_field: "accession" # The field in the NDJSON record that contains the actual genomic sequence output_sequence_field: "sequence" # The field in the NDJSON record that contains the actual GenBank accession - genbank_accession: "genbank_accession" + genbank_accession: "accession" # The list of metadata columns to keep in the final output of the curation pipeline. metadata_columns: [ "strain", - "genbank_accession", - "genbank_accession_rev", + "accession", + "accession_version", "date", "region", "country", @@ -129,7 +129,7 @@ nextclade: gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"] # Nextclade Fields to rename to metadata field names. field_map: - seqName: genbank_accession # ID field used to merge annotations + seqName: accession # ID field used to merge annotations clade: genotype_nextclade alignmentStart: alignmentStart alignmentEnd: alignmentEnd diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 55a66936..0932c2e2 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -77,8 +77,8 @@ rule format_ncbi_dataset_report: --elide-header \ | csvtk fix-quotes -Ht \ | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \ - | csvtk rename -t -f accession -n accession-rev \ - | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \ + | csvtk rename -t -f accession -n accession_version \ + | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \ | csvtk del-quotes -t \ | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv} @@ -99,7 +99,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column accession-rev \ + --seq-id-column accession_version \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 643fbb5e..06644d61 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -107,10 +107,13 @@ rule calculate_gene_coverage: gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv", wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS, + params: + id_field=config["curate"]["output_id_field"], shell: """ python scripts/calculate-gene-converage-from-nextclade-translation.py \ --fasta {input.nextclade_translation} \ + --out-id {params.id_field} \ --out-col {wildcards.gene}_coverage \ > {output.gene_coverage} """ diff --git a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py index 4ab7c199..c6bdcb5d 100644 --- a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py +++ b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py @@ -17,6 +17,12 @@ def parse_args(): type=str, help="FASTA file of CDS translations from Nextclade.", ) + parser.add_argument( + "--out-id", + type=str, + default="accession", + help="Output record ID.", + ) parser.add_argument( "--out-col", type=str, @@ -26,16 +32,16 @@ def parse_args(): return parser.parse_args() -def calculate_gene_coverage_from_nextclade_cds(fasta, out_col): +def calculate_gene_coverage_from_nextclade_cds(fasta, out_id, out_col): """ Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade. """ - print(f"genbank_accession\t{out_col}") + print(f"{out_id}\t{out_col}") # Iterate over the sequences in the FASTA file for record in SeqIO.parse(fasta, "fasta"): sequence_id = record.id sequence = str(record.seq) - + # Calculate gene coverage results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])", sequence.upper()) gene_coverage = round(len(results) / len(sequence), 3) @@ -47,7 +53,7 @@ def calculate_gene_coverage_from_nextclade_cds(fasta, out_col): def main(): args = parse_args() - calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col) + calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_id, args.out_col) if __name__ == "__main__": From cca844c330aa9157bb3f34fc80cffab8935b7653 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 17 Dec 2024 16:49:09 -0800 Subject: [PATCH 05/10] Rename phylogenetic/config to phylogenetic/defaults To be more consistent with the pathogen-repo-guide and the ingest workflow --- phylogenetic/Snakefile | 2 +- .../{config => defaults}/clades_genotypes.tsv | 0 .../{config => defaults}/clades_serotypes.tsv | 0 .../{config => defaults}/color_orderings.tsv | 0 .../{config => defaults}/color_schemes.tsv | 0 .../{config => defaults}/config_dengue.yaml | 16 ++++++++-------- phylogenetic/{config => defaults}/description.md | 0 phylogenetic/{config => defaults}/exclude.txt | 0 .../{config => defaults}/include_all.txt | 0 .../{config => defaults}/include_denv1.txt | 0 .../{config => defaults}/include_denv2.txt | 0 .../{config => defaults}/include_denv3.txt | 0 .../{config => defaults}/include_denv4.txt | 0 .../{config => defaults}/reference_all_genome.gb | 0 .../reference_denv1_genome.gb | 0 .../reference_denv2_genome.gb | 0 .../reference_denv3_genome.gb | 0 .../reference_denv4_genome.gb | 0 phylogenetic/rules/annotate_phylogeny.smk | 2 +- phylogenetic/rules/export.smk | 8 ++++---- phylogenetic/rules/prepare_sequences.smk | 2 +- phylogenetic/rules/prepare_sequences_E.smk | 8 ++++---- 22 files changed, 19 insertions(+), 19 deletions(-) rename phylogenetic/{config => defaults}/clades_genotypes.tsv (100%) rename phylogenetic/{config => defaults}/clades_serotypes.tsv (100%) rename phylogenetic/{config => defaults}/color_orderings.tsv (100%) rename phylogenetic/{config => defaults}/color_schemes.tsv (100%) rename phylogenetic/{config => defaults}/config_dengue.yaml (75%) rename phylogenetic/{config => defaults}/description.md (100%) rename phylogenetic/{config => defaults}/exclude.txt (100%) rename phylogenetic/{config => defaults}/include_all.txt (100%) rename phylogenetic/{config => defaults}/include_denv1.txt (100%) rename phylogenetic/{config => defaults}/include_denv2.txt (100%) rename phylogenetic/{config => defaults}/include_denv3.txt (100%) rename phylogenetic/{config => defaults}/include_denv4.txt (100%) rename phylogenetic/{config => defaults}/reference_all_genome.gb (100%) rename phylogenetic/{config => defaults}/reference_denv1_genome.gb (100%) rename phylogenetic/{config => defaults}/reference_denv2_genome.gb (100%) rename phylogenetic/{config => defaults}/reference_denv3_genome.gb (100%) rename phylogenetic/{config => defaults}/reference_denv4_genome.gb (100%) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 46457340..c1e926f1 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,4 +1,4 @@ -configfile: "config/config_dengue.yaml" +configfile: "defaults/config_dengue.yaml" serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] genes = ['genome', 'E'] diff --git a/phylogenetic/config/clades_genotypes.tsv b/phylogenetic/defaults/clades_genotypes.tsv similarity index 100% rename from phylogenetic/config/clades_genotypes.tsv rename to phylogenetic/defaults/clades_genotypes.tsv diff --git a/phylogenetic/config/clades_serotypes.tsv b/phylogenetic/defaults/clades_serotypes.tsv similarity index 100% rename from phylogenetic/config/clades_serotypes.tsv rename to phylogenetic/defaults/clades_serotypes.tsv diff --git a/phylogenetic/config/color_orderings.tsv b/phylogenetic/defaults/color_orderings.tsv similarity index 100% rename from phylogenetic/config/color_orderings.tsv rename to phylogenetic/defaults/color_orderings.tsv diff --git a/phylogenetic/config/color_schemes.tsv b/phylogenetic/defaults/color_schemes.tsv similarity index 100% rename from phylogenetic/config/color_schemes.tsv rename to phylogenetic/defaults/color_schemes.tsv diff --git a/phylogenetic/config/config_dengue.yaml b/phylogenetic/defaults/config_dengue.yaml similarity index 75% rename from phylogenetic/config/config_dengue.yaml rename to phylogenetic/defaults/config_dengue.yaml index fbcd7263..b861686e 100644 --- a/phylogenetic/config/config_dengue.yaml +++ b/phylogenetic/defaults/config_dengue.yaml @@ -8,8 +8,8 @@ strain_id_field: "genbank_accession" display_strain_field: "strain" filter: - exclude: "config/exclude.txt" - include: "config/include_{serotype}.txt" + exclude: "defaults/exclude.txt" + include: "defaults/include_{serotype}.txt" group_by: "year region" min_length: genome: 5000 @@ -32,11 +32,11 @@ traits: clades: clade_definitions: - all: 'config/clades_serotypes.tsv' - denv1: 'config/clades_genotypes.tsv' - denv2: 'config/clades_genotypes.tsv' - denv3: 'config/clades_genotypes.tsv' - denv4: 'config/clades_genotypes.tsv' + all: 'defaults/clades_serotypes.tsv' + denv1: 'defaults/clades_genotypes.tsv' + denv2: 'defaults/clades_genotypes.tsv' + denv3: 'defaults/clades_genotypes.tsv' + denv4: 'defaults/clades_genotypes.tsv' export: - description: "config/description.md" + description: "defaults/description.md" diff --git a/phylogenetic/config/description.md b/phylogenetic/defaults/description.md similarity index 100% rename from phylogenetic/config/description.md rename to phylogenetic/defaults/description.md diff --git a/phylogenetic/config/exclude.txt b/phylogenetic/defaults/exclude.txt similarity index 100% rename from phylogenetic/config/exclude.txt rename to phylogenetic/defaults/exclude.txt diff --git a/phylogenetic/config/include_all.txt b/phylogenetic/defaults/include_all.txt similarity index 100% rename from phylogenetic/config/include_all.txt rename to phylogenetic/defaults/include_all.txt diff --git a/phylogenetic/config/include_denv1.txt b/phylogenetic/defaults/include_denv1.txt similarity index 100% rename from phylogenetic/config/include_denv1.txt rename to phylogenetic/defaults/include_denv1.txt diff --git a/phylogenetic/config/include_denv2.txt b/phylogenetic/defaults/include_denv2.txt similarity index 100% rename from phylogenetic/config/include_denv2.txt rename to phylogenetic/defaults/include_denv2.txt diff --git a/phylogenetic/config/include_denv3.txt b/phylogenetic/defaults/include_denv3.txt similarity index 100% rename from phylogenetic/config/include_denv3.txt rename to phylogenetic/defaults/include_denv3.txt diff --git a/phylogenetic/config/include_denv4.txt b/phylogenetic/defaults/include_denv4.txt similarity index 100% rename from phylogenetic/config/include_denv4.txt rename to phylogenetic/defaults/include_denv4.txt diff --git a/phylogenetic/config/reference_all_genome.gb b/phylogenetic/defaults/reference_all_genome.gb similarity index 100% rename from phylogenetic/config/reference_all_genome.gb rename to phylogenetic/defaults/reference_all_genome.gb diff --git a/phylogenetic/config/reference_denv1_genome.gb b/phylogenetic/defaults/reference_denv1_genome.gb similarity index 100% rename from phylogenetic/config/reference_denv1_genome.gb rename to phylogenetic/defaults/reference_denv1_genome.gb diff --git a/phylogenetic/config/reference_denv2_genome.gb b/phylogenetic/defaults/reference_denv2_genome.gb similarity index 100% rename from phylogenetic/config/reference_denv2_genome.gb rename to phylogenetic/defaults/reference_denv2_genome.gb diff --git a/phylogenetic/config/reference_denv3_genome.gb b/phylogenetic/defaults/reference_denv3_genome.gb similarity index 100% rename from phylogenetic/config/reference_denv3_genome.gb rename to phylogenetic/defaults/reference_denv3_genome.gb diff --git a/phylogenetic/config/reference_denv4_genome.gb b/phylogenetic/defaults/reference_denv4_genome.gb similarity index 100% rename from phylogenetic/config/reference_denv4_genome.gb rename to phylogenetic/defaults/reference_denv4_genome.gb diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 4e3338e4..f62d8d70 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -44,7 +44,7 @@ rule translate: input: tree = "results/{gene}/tree_{serotype}.nwk", node_data = "results/{gene}/nt-muts_{serotype}.json", - reference = lambda wildcard: "config/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/config/reference_{serotype}_{gene}.gb" + reference = lambda wildcard: "defaults/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb" output: node_data = "results/{gene}/aa-muts_{serotype}.json" shell: diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index c610eec3..92211388 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -22,8 +22,8 @@ import json rule colors: input: - color_schemes = "config/color_schemes.tsv", - color_orderings = "config/color_orderings.tsv", + color_schemes = "defaults/color_schemes.tsv", + color_orderings = "defaults/color_orderings.tsv", metadata = "data/metadata_{serotype}.tsv", output: colors = "results/colors_{serotype}.tsv" @@ -40,7 +40,7 @@ rule colors: rule prepare_auspice_config: """Prepare the auspice config file for each serotypes""" output: - auspice_config="results/config/{gene}/auspice_config_{serotype}.json", + auspice_config="results/defaults/{gene}/auspice_config_{serotype}.json", params: replace_clade_key=lambda wildcard: r"clade_membership" if wildcard.gene in ['genome'] else r"genotype_nextclade", replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"Dengue Genotype (Nextclade)", @@ -140,7 +140,7 @@ rule export: nt_muts = "results/{gene}/nt-muts_{serotype}.json", aa_muts = "results/{gene}/aa-muts_{serotype}.json", description = config["export"]["description"], - auspice_config = "results/config/{gene}/auspice_config_{serotype}.json", + auspice_config = "results/defaults/{gene}/auspice_config_{serotype}.json", colors = "results/colors_{serotype}.tsv", output: auspice_json = "results/{gene}/raw_dengue_{serotype}.json" diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 60144f85..ec4157ac 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -85,7 +85,7 @@ rule align: """ input: sequences = "results/{gene}/filtered_{serotype}.fasta", - reference = lambda wildcard: "config/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/config/reference_{serotype}_{gene}.gb" + reference = lambda wildcard: "defaults/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/defaults/reference_{serotype}_{gene}.gb" output: alignment = "results/{gene}/aligned_{serotype}.fasta" shell: diff --git a/phylogenetic/rules/prepare_sequences_E.smk b/phylogenetic/rules/prepare_sequences_E.smk index 877fa304..c3c30030 100644 --- a/phylogenetic/rules/prepare_sequences_E.smk +++ b/phylogenetic/rules/prepare_sequences_E.smk @@ -21,10 +21,10 @@ rule generate_E_reference_files: Generating reference files for the E gene """ input: - reference = "config/reference_{serotype}_genome.gb", + reference = "defaults/reference_{serotype}_genome.gb", output: - fasta = "results/config/reference_{serotype}_E.fasta", - genbank = "results/config/reference_{serotype}_E.gb", + fasta = "results/defaults/reference_{serotype}_E.fasta", + genbank = "results/defaults/reference_{serotype}_E.gb", params: gene = "E", shell: @@ -42,7 +42,7 @@ rule align_and_extract_E: """ input: sequences = "data/sequences_{serotype}.fasta", - reference = "results/config/reference_{serotype}_E.fasta" + reference = "results/defaults/reference_{serotype}_E.fasta" output: sequences = "results/E/sequences_{serotype}.fasta" params: From a56685a2371886d467ffbd238f0e957afb2e6d60 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 17 Dec 2024 16:55:57 -0800 Subject: [PATCH 06/10] Phylogenetic: Change `genbank_accession` to `accession` Use the combination of `accession` and `url` fields in the phylogenetic build. This change follows a similar change to the ingest workflow. --- phylogenetic/defaults/config_dengue.yaml | 2 +- phylogenetic/example_data/metadata_all.tsv | 2 +- phylogenetic/example_data/metadata_denv1.tsv | 2 +- phylogenetic/example_data/metadata_denv2.tsv | 2 +- phylogenetic/example_data/metadata_denv3.tsv | 2 +- phylogenetic/example_data/metadata_denv4.tsv | 2 +- phylogenetic/rules/export.smk | 3 ++- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/phylogenetic/defaults/config_dengue.yaml b/phylogenetic/defaults/config_dengue.yaml index b861686e..1e5854e6 100644 --- a/phylogenetic/defaults/config_dengue.yaml +++ b/phylogenetic/defaults/config_dengue.yaml @@ -4,7 +4,7 @@ sequences_url: "https://data.nextstrain.org/files/workflows/dengue/sequences_{serotype}.fasta.zst" metadata_url: "https://data.nextstrain.org/files/workflows/dengue/metadata_{serotype}.tsv.zst" -strain_id_field: "genbank_accession" +strain_id_field: "accession" display_strain_field: "strain" filter: diff --git a/phylogenetic/example_data/metadata_all.tsv b/phylogenetic/example_data/metadata_all.tsv index 01a411da..ba8c0c35 100644 --- a/phylogenetic/example_data/metadata_all.tsv +++ b/phylogenetic/example_data/metadata_all.tsv @@ -1,4 +1,4 @@ -strain virus genbank_accession date region country division city db segment authors url title journal paper_url +strain virus accession date region country division city db segment authors url title journal paper_url DENV1/VIETNAM/BIDV1862/2008 dengue FJ410220 2008-XX-XX Southeast Asia Vietnam Vietnam Vietnam ? genome Alvarado et al. ? ? ? ? DENV3/WALLIS_AND_FUTUNA/1404838/1989 dengue JQ920487 1989-08-14 ? Wallis And Futuna French Overseas Collectivity Wallis And Futuna ? genome Aubry et al. ? ? ? ? DENV/BRAZIL/SJRP778/2013 dengue KT438585 2013-02-04 South America Brazil Brazil Brazil ? genome Araujo et al. ? ? ? ? diff --git a/phylogenetic/example_data/metadata_denv1.tsv b/phylogenetic/example_data/metadata_denv1.tsv index 0574a0c7..b697eb54 100644 --- a/phylogenetic/example_data/metadata_denv1.tsv +++ b/phylogenetic/example_data/metadata_denv1.tsv @@ -1,4 +1,4 @@ -strain virus genbank_accession date region country division city db segment authors url title journal paper_url +strain virus accession date region country division city db segment authors url title journal paper_url DENV1/INDIA/237/1962 dengue JF297572 1962-XX-XX South Asia India India India ? genome Cecilia et al. ? ? ? ? DENV1/CHINA/RL6/2013 dengue KJ470727 2013-09-XX China China China China ? genome Guo et al. ? ? ? ? DENV1/THAILAND/AILANDKPPKDV08491NC04528V0L/2007 dengue JQ993108 2007-XX-XX Southeast Asia Thailand Thailand Thailand ? genome Aldstadt et al. ? ? ? ? diff --git a/phylogenetic/example_data/metadata_denv2.tsv b/phylogenetic/example_data/metadata_denv2.tsv index 5c694c00..31f88305 100644 --- a/phylogenetic/example_data/metadata_denv2.tsv +++ b/phylogenetic/example_data/metadata_denv2.tsv @@ -1,4 +1,4 @@ -strain virus genbank_accession date region country division city db segment authors url title journal paper_url +strain virus accession date region country division city db segment authors url title journal paper_url DENV2/INDONESIA/MKS0084/2007 dengue KC762660 2007-11-21 Southeast Asia Indonesia Makassar Makassar ? genome Bifani et al. ? ? ? ? DENV2/TAIWAN/71/2002 dengue EF016253 2002-XX-XX China Taiwan Taiwan Taiwan ? genome Ke et al. ? ? ? ? DENV2/PAKISTAN/8D/2011 dengue JX042512 2011-XX-XX West Asia Pakistan Pakistan Pakistan ? genome Alvi et al. ? ? ? ? diff --git a/phylogenetic/example_data/metadata_denv3.tsv b/phylogenetic/example_data/metadata_denv3.tsv index 18c5b0aa..911e8cef 100644 --- a/phylogenetic/example_data/metadata_denv3.tsv +++ b/phylogenetic/example_data/metadata_denv3.tsv @@ -1,4 +1,4 @@ -strain virus genbank_accession date region country division city db segment authors url title journal paper_url +strain virus accession date region country division city db segment authors url title journal paper_url DENV3/PUERTO_RICO/BIDV1615/2004 dengue FJ373302 2004-XX-XX North America Puerto Rico Puerto Rico Puerto Rico ? genome Alvarado et al. ? ? ? ? DENV3/THAILAND/BIDV2323/2001 dengue FJ744734 2001-XX-XX Southeast Asia Thailand Kamphaeng Phet Kamphaeng Phet ? genome Alvarado et al. ? ? ? ? DENV3/BANGLADESH/JACOB/NA dengue AY656673 ? South Asia Bangladesh Bangladesh Bangladesh ? genome Aaskov et al. ? ? ? ? diff --git a/phylogenetic/example_data/metadata_denv4.tsv b/phylogenetic/example_data/metadata_denv4.tsv index 9d9ce386..d3175b21 100644 --- a/phylogenetic/example_data/metadata_denv4.tsv +++ b/phylogenetic/example_data/metadata_denv4.tsv @@ -1,4 +1,4 @@ -strain virus genbank_accession date region country division city db segment authors url title journal paper_url +strain virus accession date region country division city db segment authors url title journal paper_url DENV4/ECUADOR/FSE2098/2006 dengue GQ139572 2006-XX-XX South America Ecuador Ecuador Ecuador ? genome Alava et al. ? ? ? ? DENV4/BRAZIL/SJRP580/2012 dengue KP188562 2012-12-10 South America Brazil Brazil Brazil ? genome Araujo et al. ? ? ? ? DENV4/BRAZIL/GUSP0670/2013 dengue KP704043 2013-03-17 South America Brazil Brazil Brazil ? genome Balarini et al. ? ? ? ? diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 92211388..3b50f37f 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -103,7 +103,8 @@ rule prepare_auspice_config: "author" ], "metadata_columns": [ - "genbank_accession" + "accession", + "url" ] } From 4acba653960ff03c751f7ff9d3a718010d3e91ec Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 18 Dec 2024 14:04:04 -0800 Subject: [PATCH 07/10] Automatically drop is_lab_host=true records during phylogenetic workflow --- phylogenetic/rules/prepare_sequences.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index ec4157ac..44f42249 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -75,7 +75,8 @@ rule filter: --group-by {params.group_by} \ --sequences-per-group {params.sequences_per_group} \ --min-length {params.min_length} \ - --exclude-where country=? region=? date=? \ + --exclude-where country=? region=? date=? is_lab_host='true' \ + --query-columns is_lab_host:str """ rule align: From 65520e5d097022acfee88ed5fde9d0eef0cdf3d6 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 18 Dec 2024 15:52:41 -0800 Subject: [PATCH 08/10] Add frequencies panels --- phylogenetic/Snakefile | 1 + phylogenetic/defaults/config_dengue.yaml | 6 ++++ phylogenetic/rules/export.smk | 35 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index c1e926f1..633fe057 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -10,6 +10,7 @@ wildcard_constraints: rule all: input: auspice_json = expand("auspice/dengue_{serotype}_{gene}.json", serotype=serotypes, gene=genes), + tip_frequencies_json= expand("auspice/dengue_{serotype}_{gene}_tip-frequencies.json", serotype=serotypes, gene=genes) include: "rules/prepare_sequences.smk" include: "rules/prepare_sequences_E.smk" diff --git a/phylogenetic/defaults/config_dengue.yaml b/phylogenetic/defaults/config_dengue.yaml index 1e5854e6..320bdedf 100644 --- a/phylogenetic/defaults/config_dengue.yaml +++ b/phylogenetic/defaults/config_dengue.yaml @@ -38,5 +38,11 @@ clades: denv3: 'defaults/clades_genotypes.tsv' denv4: 'defaults/clades_genotypes.tsv' +tip_frequencies: + min_date: "1980-01-01" + max_date: "6M" + narrow_bandwidth: 0.2 + wide_bandwidth: 0.6 + export: description: "defaults/description.md" diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 3b50f37f..aa49e686 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -102,6 +102,12 @@ rule prepare_auspice_config: "region", "author" ], + "panels": [ + "tree", + "map", + "entropy", + "frequencies" + ], "metadata_columns": [ "accession", "url" @@ -179,3 +185,32 @@ rule final_strain_name: --display-strain-name {params.display_strain_field} \ --output {output.auspice_json} """ + +rule tip_frequencies: + """ + Estimating KDE frequencies for tips + """ + input: + tree = "results/{gene}/tree_{serotype}.nwk", + metadata = "data/metadata_{serotype}.tsv", + output: + tip_freq = "auspice/dengue_{serotype}_{gene}_tip-frequencies.json" + params: + strain_id = config["strain_id_field"], + min_date = config["tip_frequencies"]["min_date"], + max_date = config["tip_frequencies"]["max_date"], + narrow_bandwidth = config["tip_frequencies"]["narrow_bandwidth"], + wide_bandwidth = config["tip_frequencies"]["wide_bandwidth"] + shell: + r""" + augur frequencies \ + --method kde \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --min-date {params.min_date} \ + --max-date {params.max_date} \ + --narrow-bandwidth {params.narrow_bandwidth} \ + --wide-bandwidth {params.wide_bandwidth} \ + --output {output.tip_freq} + """ From 1b6424bd57fa62376b08ccad0979405b78cad374 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 31 Dec 2024 13:10:23 -0800 Subject: [PATCH 09/10] Tip frequency strain ID must match the displayed strain ID in the final auspice JSON file @j23414 initially believed that the Augur frequency function could utilize the 'accession' strain IDs to match the tree.json file. However, @joverlee521 pointed out that the frequencies JSON actually requires matching the final 'strain' IDs as found in the Auspice JSON file. Otherwise Auspice cannot link the strains to their frequencies. Co-authored-by: Jover Lee --- phylogenetic/rules/export.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index aa49e686..c344b50a 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -196,7 +196,7 @@ rule tip_frequencies: output: tip_freq = "auspice/dengue_{serotype}_{gene}_tip-frequencies.json" params: - strain_id = config["strain_id_field"], + strain_id = config["display_strain_field"], min_date = config["tip_frequencies"]["min_date"], max_date = config["tip_frequencies"]["max_date"], narrow_bandwidth = config["tip_frequencies"]["narrow_bandwidth"], From b8b2fa34accf8188f6c5bad49000c6bd17e41c48 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 31 Dec 2024 15:10:32 -0800 Subject: [PATCH 10/10] Drop set_final_strain_name.py Remove set_final_strain_name.py so that both the main JSON and the frequencies JSON use the accession as the strain id. This simplifies the augur frequencies call and the "accession" vs "strain" confusion. --- phylogenetic/rules/export.smk | 27 +++---------- phylogenetic/scripts/set_final_strain_name.py | 38 ------------------- 2 files changed, 5 insertions(+), 60 deletions(-) delete mode 100755 phylogenetic/scripts/set_final_strain_name.py diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index c344b50a..cf1a1fd7 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -95,7 +95,8 @@ rule prepare_auspice_config: ], "display_defaults": { "map_triplicate": True, - "color_by": params.replace_clade_key + "color_by": params.replace_clade_key, + "tip_label": "strain" }, "filters": [ "country", @@ -110,6 +111,7 @@ rule prepare_auspice_config: ], "metadata_columns": [ "accession", + "strain", "url" ] } @@ -150,7 +152,7 @@ rule export: auspice_config = "results/defaults/{gene}/auspice_config_{serotype}.json", colors = "results/colors_{serotype}.tsv", output: - auspice_json = "results/{gene}/raw_dengue_{serotype}.json" + auspice_json = "auspice/dengue_{serotype}_{gene}.json" params: strain_id = config.get("strain_id_field", "strain"), shell: @@ -167,25 +169,6 @@ rule export: --output {output.auspice_json} """ -rule final_strain_name: - input: - auspice_json="results/{gene}/raw_dengue_{serotype}.json", - metadata="data/metadata_{serotype}.tsv", - output: - auspice_json="auspice/dengue_{serotype}_{gene}.json" - params: - strain_id=config.get("strain_id_field", "strain"), - display_strain_field=config.get("display_strain_field", "strain"), - shell: - """ - python3 scripts/set_final_strain_name.py \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --input-auspice-json {input.auspice_json} \ - --display-strain-name {params.display_strain_field} \ - --output {output.auspice_json} - """ - rule tip_frequencies: """ Estimating KDE frequencies for tips @@ -196,7 +179,7 @@ rule tip_frequencies: output: tip_freq = "auspice/dengue_{serotype}_{gene}_tip-frequencies.json" params: - strain_id = config["display_strain_field"], + strain_id = config["strain_id_field"], min_date = config["tip_frequencies"]["min_date"], max_date = config["tip_frequencies"]["max_date"], narrow_bandwidth = config["tip_frequencies"]["narrow_bandwidth"], diff --git a/phylogenetic/scripts/set_final_strain_name.py b/phylogenetic/scripts/set_final_strain_name.py deleted file mode 100755 index 08ca9359..00000000 --- a/phylogenetic/scripts/set_final_strain_name.py +++ /dev/null @@ -1,38 +0,0 @@ -import pandas as pd -import json, argparse -from augur.io import read_metadata - -def replace_name_recursive(node, lookup): - if node["name"] in lookup: - node["name"] = lookup[node["name"]] - - if "children" in node: - for child in node["children"]: - replace_name_recursive(child, lookup) - -if __name__=="__main__": - parser = argparse.ArgumentParser( - description="Swaps out the strain names in the Auspice JSON with the final strain name", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json") - parser.add_argument('--metadata', type=str, required=True, help="input data") - parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") - parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice") - parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") - args = parser.parse_args() - - metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns) - name_lookup = {} - for ri, row in metadata.iterrows(): - strain_id = row.name - name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name] - - with open(args.input_auspice_json, 'r') as fh: - data = json.load(fh) - - replace_name_recursive(data['tree'], name_lookup) - - with open(args.output, 'w') as fh: - json.dump(data, fh)