git subrepo pull (merge) ingest/vendored

subrepo: subdir: "ingest/vendored" merged: "c97df23" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "c97df23" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo" commit: "110b9eb"
nextstrain · Sep 1, 2023 · afeec9c · afeec9c
1 parent fb81750
commit afeec9c
Show file tree

Hide file tree

Showing 22 changed files with 580 additions and 13 deletions.
diff --git a/ingest/vendored/.cramrc b/ingest/vendored/.cramrc
@@ -0,0 +1,3 @@
+[cram]
+shell = /bin/bash
+indent = 2
diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml
@@ -11,3 +11,11 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: nextstrain/.github/actions/shellcheck@master
+
+  cram:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+      - run: pip install cram
+      - run: cram tests/
diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = https://github.com/nextstrain/ingest
 	branch = main
-	commit = 1eb8b30428d5f66adac201f0a246a7ab4bdc9792
-	parent = a7ccb51fecede6336ef06ec2a980db27aa5821b4
+	commit = c97df238518171c2b1574bec0349a55855d1e7a7
+	parent = fb8175010a16294954d4d4a499edc137bede8265
 	method = merge
 	cmdver = 0.4.6
diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md
@@ -69,6 +69,14 @@ Scripts for supporting ingest workflow automation that don’t really belong in
 - [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message`
   A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated.
 
+NCBI interaction scripts that are useful for fetching public metadata and sequences.
+
+- [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file.
+  Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/) or [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs.
+- [fetch-from-ncbi-virus](fetch-from-ncbi-virus) - Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) and output NDJSON records to stdout.
+- [ncbi-virus-url](ncbi-virus-url) - Generates the URL to download metadata and sequences from NCBI Virus as a single CSV file.
+- [csv-to-ndjson](csv-to-ndjson) - Converts a CSV file to an NDJSON file, with a hard-coded 200 MiB field size limit to accommodate sequences in the NCBI Virus download.
+
 Potential Nextstrain CLI scripts
 
 - [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts.
@@ -89,3 +97,16 @@ Potential augur curate scripts
 - [transform-authors](transform-authors) - Abbreviates full author lists to '<first author> et al.'
 - [transform-field-names](transform-field-names) - Rename fields of NDJSON records
 - [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"<country_value>[:<region>][, <locality>]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/)
+
+## Software requirements
+
+Some scripts may require Bash ≥4. If you are running these scripts on macOS, the built-in Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash), which is more up to date.
+
+## Testing
+
+Most scripts are untested within this repo, relying on "testing in production". That is the only practical testing option for some scripts such as the ones interacting with S3 and Slack.
+
+For more locally testable scripts, Cram-style functional tests live in `tests` and are run as part of CI. To run these locally:
+
+1. Install Cram: `pip install cram`
+2. Run the tests: `cram tests/`
diff --git a/ingest/vendored/cloudfront-invalidate b/ingest/vendored/cloudfront-invalidate
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 # Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8
 set -euo pipefail
 

diff --git a/ingest/vendored/csv-to-ndjson b/ingest/vendored/csv-to-ndjson
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+"""
+Convert CSV on stdin to NDJSON on stdout.
+usage: `cat dummy.csv | ./csv-to-ndjson > dummy.ndjson`
+"""
+import csv
+import json
+from sys import stdin, stdout
+
+# Raise the CSV field size limit to 200 MiB (the default is 128 KiB) so that long sequences fit in a single field.
+csv.field_size_limit(200 * 1024 * 1024)
+
+for row in csv.DictReader(stdin):
+    json.dump(row, stdout, allow_nan = False, indent = None, separators = ',:')
+    print()
diff --git a/ingest/vendored/docs/ncbi-virus-all-fields-example.json b/ingest/vendored/docs/ncbi-virus-all-fields-example.json
@@ -0,0 +1,292 @@
+{
+    "ExportDate_dt": "2023-08-08T21:02:01.475Z",
+    "QualNum_i": 0,
+    "QualPct_d": 0.0,
+    "IncompleteCdsCnt_i": 0,
+    "gi_l": 1798174254,
+    "Host_s": "Homo sapiens",
+    "HostSpecies_s": "Homo sapiens (human), taxid:9606|",
+    "HostLineage_ss": [
+        "cellular organisms, taxid:131567| biota",
+        "Eukaryota (eucaryotes), taxid:2759| eukaryotes Eucarya Eucaryotae Eukarya Eukaryotae",
+        "Opisthokonta, taxid:33154| Fungi/Metazoa group opisthokonts",
+        "Metazoa (metazoans), taxid:33208| multicellular animals Animalia animals",
+        "Eumetazoa, taxid:6072|",
+        "Bilateria, taxid:33213|",
+        "Deuterostomia (deuterostomes), taxid:33511|",
+        "Chordata (chordates), taxid:7711|",
+        "Craniata, taxid:89593|",
+        "Vertebrata (vertebrates), taxid:7742|",
+        "Gnathostomata (jawed vertebrates), taxid:7776|",
+        "Teleostomi, taxid:117570|",
+        "Euteleostomi (bony vertebrates), taxid:117571|",
+        "Sarcopterygii, taxid:8287|",
+        "Dipnotetrapodomorpha, taxid:1338369|",
+        "Tetrapoda (tetrapods), taxid:32523|",
+        "Amniota (amniotes), taxid:32524|",
+        "Mammalia (mammals), taxid:40674|",
+        "Theria, taxid:32525|",
+        "Eutheria (placentals), taxid:9347| eutherian mammals placental mammals Placentalia",
+        "Boreoeutheria, taxid:1437010| Boreotheria",
+        "Euarchontoglires, taxid:314146|",
+        "Primates, taxid:9443| Primata primates",
+        "Haplorrhini, taxid:376913|",
+        "Simiiformes, taxid:314293| Anthropoidea",
+        "Catarrhini, taxid:9526|",
+        "Hominoidea (apes), taxid:314295| ape",
+        "Hominidae (great apes), taxid:9604| Pongidae",
+        "Homininae, taxid:207598| Homo/Pan/Gorilla group",
+        "Homo (humans), taxid:9605|",
+        "Homo sapiens (human), taxid:9606|"
+    ],
+    "HostLineageId_ss": [
+        "131567",
+        "2759",
+        "33154",
+        "33208",
+        "6072",
+        "33213",
+        "33511",
+        "7711",
+        "89593",
+        "7742",
+        "7776",
+        "117570",
+        "117571",
+        "8287",
+        "1338369",
+        "32523",
+        "32524",
+        "40674",
+        "32525",
+        "9347",
+        "1437010",
+        "314146",
+        "9443",
+        "376913",
+        "314293",
+        "9526",
+        "314295",
+        "9604",
+        "207598",
+        "9605",
+        "9606"
+    ],
+    "Locus_s": "NC_045512",
+    "OrgId_i": 2697049,
+    "VirusFamily_s": "Coronaviridae",
+    "VirusGenus_s": "Betacoronavirus",
+    "VirusSpecies_s": "Severe acute respiratory syndrome-related coronavirus",
+    "VirusSpeciesId_i": 694009,
+    "VirusLineage_ss": [
+        "Viruses, taxid:10239| Vira Viridae viruses",
+        "Riboviria (RNA viruses), taxid:2559587| RNA viruses and viroids",
+        "Orthornavirae, taxid:2732396|",
+        "Pisuviricota, taxid:2732408|",
+        "Pisoniviricetes, taxid:2732506|",
+        "Nidovirales, taxid:76804|",
+        "Cornidovirineae, taxid:2499399|",
+        "Coronaviridae, taxid:11118|",
+        "Orthocoronavirinae, taxid:2501931|",
+        "Betacoronavirus, taxid:694002| Coronavirus",
+        "Sarbecovirus, taxid:2509511|",
+        "Severe acute respiratory syndrome-related coronavirus, taxid:694009| HCoV-SARS SARS SARSr-CoV SARSrCoV",
+        "Severe acute respiratory syndrome coronavirus 2, taxid:2697049| SARS-CoV-2",
+        "RNA viruses"
+    ],
+    "VirusLineageId_ss": [
+        "10239",
+        "2559587",
+        "2732396",
+        "2732408",
+        "2732506",
+        "76804",
+        "2499399",
+        "11118",
+        "2501931",
+        "694002",
+        "2509511",
+        "694009",
+        "2697049"
+    ],
+    "VirusL0_s": "RNA viruses",
+    "VirusL1_s": "Orthornavirae, taxid:2732396",
+    "VirusL2_s": "Pisuviricota, taxid:2732408",
+    "VirusL3_s": "Pisoniviricetes, taxid:2732506",
+    "VirusL4_s": "Nidovirales, taxid:76804",
+    "VirusL5_s": "Cornidovirineae, taxid:2499399",
+    "VirusL6_s": "Coronaviridae, taxid:11118",
+    "VirusL7_s": "Orthocoronavirinae, taxid:2501931",
+    "VirusL8_s": "Betacoronavirus, taxid:694002",
+    "VirusL9_s": "Sarbecovirus, taxid:2509511",
+    "VirusL10_s": "Severe acute respiratory syndrome-related coronavirus, taxid:694009",
+    "ViralHost_ss": [
+        "human",
+        "vertebrates"
+    ],
+    "GenomicMoltype_s": "ssRNA(+)",
+    "SLen_i": 29903,
+    "Flags_ss": [
+        "refseq",
+        "complete"
+    ],
+    "Flags_csv": "refseq, complete",
+    "FlagsCount_i": 2,
+    "SetAcc_s": "GCF_009858895.2",
+    "Authors_ss": [
+        "Wu,F.",
+        "Zhao,S.",
+        "Yu,B.",
+        "Chen,Y.M.",
+        "Wang,W.",
+        "Song,Z.G.",
+        "Hu,Y.",
+        "Tao,Z.W.",
+        "Tian,J.H.",
+        "Pei,Y.Y.",
+        "Yuan,M.L.",
+        "Zhang,Y.L.",
+        "Dai,F.H.",
+        "Liu,Y.",
+        "Wang,Q.M.",
+        "Zheng,J.J.",
+        "Xu,L.",
+        "Holmes,E.C.",
+        "Zhang,Y.Z.",
+        "Baranov,P.V.",
+        "Henderson,C.M.",
+        "Anderson,C.B.",
+        "Gesteland,R.F.",
+        "Atkins,J.F.",
+        "Howard,M.T.",
+        "Robertson,M.P.",
+        "Igel,H.",
+        "Baertsch,R.",
+        "Haussler,D.",
+        "Ares,M. Jr.",
+        "Scott,W.G.",
+        "Williams,G.D.",
+        "Chang,R.Y.",
+        "Brian,D.A.",
+        "Chen,Y.-M.",
+        "Song,Z.-G.",
+        "Tao,Z.-W.",
+        "Tian,J.-H.",
+        "Pei,Y.-Y.",
+        "Zhang,Y.-L.",
+        "Dai,F.-H.",
+        "Wang,Q.-M.",
+        "Zheng,J.-J.",
+        "Zhang,Y.-Z."
+    ],
+    "Authors_csv": "Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y., Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H., Liu,Y., Wang,Q.M., Zheng,J.J., Xu,L., Holmes,E.C., Zhang,Y.Z., Baranov,P.V., Henderson,C.M., Anderson,C.B., Gesteland,R.F., Atkins,J.F., Howard,M.T., Robertson,M.P., Igel,H., Baertsch,R., Haussler,D., Ares,M. Jr., Scott,W.G., Williams,G.D., Chang,R.Y., Brian,D.A., Chen,Y.-M., Song,Z.-G., Tao,Z.-W., Tian,J.-H., Pei,Y.-Y., Zhang,Y.-L., Dai,F.-H., Wang,Q.-M., Zheng,J.-J., Zhang,Y.-Z.",
+    "AuthorsCount_i": 44,
+    "Country_s": "China",
+    "Isolate_s": "Wuhan-Hu-1",
+    "Lineage_s": "B",
+    "Division_s": "VRL",
+    "Keywords_ss": [
+        "RefSeq"
+    ],
+    "KeywordsCount_i": 1,
+    "TaxName_s": "Severe acute respiratory syndrome coronavirus 2",
+    "Region_s": "Asia",
+    "ParentAcc_s": "set:NC_045512",
+    "SetPosition_i": 0,
+    "SourceDB_s": "RefSeq",
+    "Definition_s": "Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome",
+    "HostId_i": 9606,
+    "CreateDate_dt": "2020-01-13T00:00:00Z",
+    "CreateYear_i": 2020,
+    "Genome_js": "[{\"id\": \"NC_045512.2\", \"segment\": null, \"proteins\": [{\"id\": \"YP_009724389.1\", \"name\": \"ORF1ab polyprotein\", \"location\": \"join(266..13468,13468..21555)\"}, {\"id\": \"YP_009725295.1\", \"name\": \"ORF1a polyprotein\", \"location\": \"266..13483\"}, {\"id\": \"YP_009724390.1\", \"name\": \"surface glycoprotein\", \"location\": \"21563..25384\"}, {\"id\": \"YP_009724391.1\", \"name\": \"ORF3a protein\", \"location\": \"25393..26220\"}, {\"id\": \"YP_009724392.1\", \"name\": \"envelope protein\", \"location\": \"26245..26472\"}, {\"id\": \"YP_009724393.1\", \"name\": \"membrane glycoprotein\", \"location\": \"26523..27191\"}, {\"id\": \"YP_009724394.1\", \"name\": \"ORF6 protein\", \"location\": \"27202..27387\"}, {\"id\": \"YP_009724395.1\", \"name\": \"ORF7a protein\", \"location\": \"27394..27759\"}, {\"id\": \"YP_009725318.1\", \"name\": \"ORF7b\", \"location\": \"27756..27887\"}, {\"id\": \"YP_009724396.1\", \"name\": \"ORF8 protein\", \"location\": \"27894..28259\"}, {\"id\": \"YP_009724397.2\", \"name\": \"nucleocapsid phosphoprotein\", \"location\": \"28274..29533\"}, {\"id\": \"YP_009725255.1\", \"name\": \"ORF10 protein\", \"location\": \"29558..29674\"}]}]",
+    "MolType_s": "RNA",
+    "ProtAcc_ss": [
+        "YP_009724389",
+        "YP_009725295",
+        "YP_009724390",
+        "YP_009724391",
+        "YP_009724392",
+        "YP_009724393",
+        "YP_009724394",
+        "YP_009724395",
+        "YP_009725318",
+        "YP_009724396",
+        "YP_009724397",
+        "YP_009725255"
+    ],
+    "ProtAccCount_i": 12,
+    "UpdateDate_dt": "2020-07-18T00:00:00Z",
+    "UpdateYear_i": 2020,
+    "PubMed_ss": [
+        "32015508",
+        "15680415",
+        "15630477",
+        "10482585"
+    ],
+    "PubMed_csv": "32015508, 15680415, 15630477, 10482585",
+    "PubMedCount_i": 4,
+    "Completeness_s": "complete",
+    "CountryFull_s": "China",
+    "ProtNames_ss": [
+        "ORF1ab polyprotein",
+        "ORF1a polyprotein",
+        "surface glycoprotein",
+        "ORF3a protein",
+        "envelope protein",
+        "membrane glycoprotein",
+        "ORF6 protein",
+        "ORF7a protein",
+        "ORF7b protein",
+        "ORF8 protein",
+        "nucleocapsid phosphoprotein",
+        "ORF10 protein"
+    ],
+    "ProtNamesCount_i": 12,
+    "IsolateParsed_s": "Wuhan-Hu-1",
+    "NuclAcc_ss": [
+        "NC_045512"
+    ],
+    "NuclAccCount_i": 1,
+    "CollectionDate_dr": "2019-12",
+    "CollectionYear_i": 2019,
+    "SubmitterAffil_s": "National Center for Biotechnology Information, NIH",
+    "BioProject_ss": [
+        "PRJNA485481"
+    ],
+    "BioProject_csv": "PRJNA485481",
+    "BioProjectCount_i": 1,
+    "AccVer_s": "NC_045512.2",
+    "CollectionDate_s": "2019-12",
+    "SubmitterCountry_s": "USA",
+    "CollectionDate_dt": "2019-12-01T00:00:00Z",
+    "GenomeCompleteness_s": "complete",
+    "SubmitterAffilFull_s": "National Center for Biotechnology Information, NIH",
+    "BioProject_s": "PRJNA485481",
+    "AccNV_s": "NC_045512",
+    "id": "NC_045512",
+    "SeqType_s": "Nucleotide",
+    "FastaMD5_s": "4928f859a1822d291e0225206a0068c8",
+    "live_i": 1,
+    "ids_ss": [
+        "GCF_009858895",
+        "GCF_009858895.2",
+        "NC_045512",
+        "NC_045512.2",
+        "PRJNA485481",
+        "YP_009724389",
+        "YP_009724390",
+        "YP_009724391",
+        "YP_009724392",
+        "YP_009724393",
+        "YP_009724394",
+        "YP_009724395",
+        "YP_009724396",
+        "YP_009724397",
+        "YP_009725255",
+        "YP_009725295",
+        "YP_009725318",
+        "set:NC_045512"
+    ],
+    "gi_i": 1798174254,
+    "_version_": 1773711315042304000
+}
diff --git a/ingest/vendored/download-from-s3 b/ingest/vendored/download-from-s3
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -euo pipefail
 
 bin="$(dirname "$0")"