From a2b3f60e299458e38cb3236507c0f914a412bd79 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:30:28 -0700 Subject: [PATCH 1/5] git subrepo clone (merge) https://github.com/nextstrain/ingest ingest/vendored subrepo: subdir: "ingest/vendored" merged: "9082700" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "9082700" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo" commit: "110b9eb" --- ingest/vendored/.github/workflows/ci.yaml | 13 +++++ ingest/vendored/.gitrepo | 12 ++++ ingest/vendored/README.md | 60 ++++++++++++++++++++ ingest/vendored/cloudfront-invalidate | 42 ++++++++++++++ ingest/vendored/merge-user-metadata | 55 ++++++++++++++++++ ingest/vendored/notify-on-job-fail | 23 ++++++++ ingest/vendored/notify-on-job-start | 27 +++++++++ ingest/vendored/notify-slack | 56 ++++++++++++++++++ ingest/vendored/s3-object-exists | 8 +++ ingest/vendored/sha256sum | 15 +++++ ingest/vendored/transform-authors | 66 ++++++++++++++++++++++ ingest/vendored/transform-field-names | 48 ++++++++++++++++ ingest/vendored/transform-genbank-location | 43 ++++++++++++++ ingest/vendored/trigger | 56 ++++++++++++++++++ 14 files changed, 524 insertions(+) create mode 100644 ingest/vendored/.github/workflows/ci.yaml create mode 100644 ingest/vendored/.gitrepo create mode 100644 ingest/vendored/README.md create mode 100755 ingest/vendored/cloudfront-invalidate create mode 100755 ingest/vendored/merge-user-metadata create mode 100755 ingest/vendored/notify-on-job-fail create mode 100755 ingest/vendored/notify-on-job-start create mode 100755 ingest/vendored/notify-slack create mode 100755 ingest/vendored/s3-object-exists create mode 100755 ingest/vendored/sha256sum create mode 100755 ingest/vendored/transform-authors create mode 100755 ingest/vendored/transform-field-names create mode 100755 ingest/vendored/transform-genbank-location create mode 100755 ingest/vendored/trigger diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml new file mode 100644 index 00000000..dcb3b898 --- /dev/null +++ b/ingest/vendored/.github/workflows/ci.yaml @@ -0,0 +1,13 @@ +name: CI + +on: + - push + - pull_request + - workflow_dispatch + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: nextstrain/.github/actions/shellcheck@master diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo new file mode 100644 index 00000000..614ef293 --- /dev/null +++ b/ingest/vendored/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/nextstrain/ingest + branch = main + commit = 9082700fdbb99007d5e852e657cedf3723d13181 + parent = 5c461dc7e90cd70c1f16b193f82fd1666d4c95e2 + method = merge + cmdver = 0.4.6 diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md new file mode 100644 index 00000000..84b855c1 --- /dev/null +++ b/ingest/vendored/README.md @@ -0,0 +1,60 @@ +# ingest + +Shared internal tooling for pathogen data ingest. Used by our individual +pathogen repos which produce Nextstrain builds. Expected to be vendored by +each pathogen repo using `git subtree` (or `git subrepo`). + +Some tools may only live here temporarily before finding a permanent home in +`augur curate` or Nextstrain CLI. 
Others may happily live out their days here. + +## History + +Much of this tooling originated in +[ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru +[monkeypox's ingest/](https://github.com/nextstrain/monkeypox/tree/@/ingest/). +It subsequently proliferated from [monkeypox][] to other pathogen repos +([rsv][], [zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily +thru copying. To [counter that +proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), +this repo was made. + +[monkeypox]: https://github.com/nextstrain/monkeypox +[rsv]: https://github.com/nextstrain/rsv +[zika]: https://github.com/nextstrain/zika/pull/24 +[dengue]: https://github.com/nextstrain/dengue/pull/10 +[hepatitisB]: https://github.com/nextstrain/hepatitisB +[forecasts-ncov]: https://github.com/nextstrain/forecasts-ncov + +## Elsewhere + +The creation of this repo, in both the abstract and concrete, and the general +approach to "ingest" has been discussed in various internal places, including: + +- https://github.com/nextstrain/private/issues/59 +- @joverlee521's [workflows document](https://docs.google.com/document/d/1rLWPvEuj0Ayc8MR0O1lfRJZfj9av53xU38f20g8nU_E/edit#heading=h.4g0d3mjvb89i) +- [5 July 2023 Slack thread](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079) +- [6 July 2023 team meeting](https://docs.google.com/document/d/1FPfx-ON5RdqL2wyvODhkrCcjgOVX3nlXgBwCPhIEsco/edit) +- _…many others_ + +## Scripts + +Scripts for supporting ingest workflow automation that don’t really belong in any of our existing tools. + +- [notify-on-job-fail](notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch +- [notify-on-job-start](notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch +- [notify-slack](notify-slack) - Send message or file to Slack +- [s3-object-exists](s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts +- [trigger](trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events. + +Potential Nextstrain CLI scripts + +- [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. +- [cloudfront-invalidate](cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104). + This exists as a separate script to support CloudFront invalidation when using the upload-to-s3 script. + +Potential augur curate scripts + +- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records +- [transform-authors](transform-authors) - Abbreviates full author lists to '<first author> et al.'
+- [transform-field-names](transform-field-names) - Rename fields of NDJSON records +- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"<country_value>[:<region>][, <locality>]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) diff --git a/ingest/vendored/cloudfront-invalidate b/ingest/vendored/cloudfront-invalidate new file mode 100755 index 00000000..dec48529 --- /dev/null +++ b/ingest/vendored/cloudfront-invalidate @@ -0,0 +1,42 @@ +#!/bin/bash +# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 +set -euo pipefail + +main() { + local domain="$1" + shift + local paths=("$@") + local distribution invalidation + + echo "-> Finding CloudFront distribution" + distribution=$( + aws cloudfront list-distributions \ + --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ + --output text + ) + + if [[ -z $distribution || $distribution == None ]]; then + exec >&2 + echo "Unable to find CloudFront distribution id for $domain" + echo + echo "Are your AWS CLI credentials for the right account?" + exit 1 + fi + + echo "-> Creating CloudFront invalidation for distribution $distribution" + invalidation=$( + aws cloudfront create-invalidation \ + --distribution-id "$distribution" \ + --paths "${paths[@]}" \ + --query Invalidation.Id \ + --output text + ) + + echo "-> Waiting for CloudFront invalidation $invalidation to complete" + echo " Ctrl-C to stop waiting." + aws cloudfront wait invalidation-completed \ + --distribution-id "$distribution" \ + --id "$invalidation" +} + +main "$@" diff --git a/ingest/vendored/merge-user-metadata b/ingest/vendored/merge-user-metadata new file mode 100755 index 00000000..341c2dfa --- /dev/null +++ b/ingest/vendored/merge-user-metadata @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Merges user curated annotations with the NDJSON records from stdin, with the user +curations overwriting the existing fields. The modified records are output +to stdout. This does not do any additional transformations on top of the user +curations. +""" +import argparse +import csv +import json +from collections import defaultdict +from sys import exit, stdin, stderr, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--annotations", metavar="TSV", required=True, + help="Manually curated annotations TSV file. " + + "The TSV should not have a header and should have exactly three columns: " + + "id to match existing metadata, field name, and field value. " + + "If there are multiple annotations for the same id and field, then the last value is used. " + + "Lines starting with '#' are treated as comments. 
" + + "Any '#' after the field value are treated as comments.") + parser.add_argument("--id-field", default="accession", + help="The ID field in the metadata to use to merge with the annotations.") + + args = parser.parse_args() + + annotations = defaultdict(dict) + with open(args.annotations, 'r') as annotations_fh: + csv_reader = csv.reader(annotations_fh, delimiter='\t') + for row in csv_reader: + if not row or row[0].lstrip()[0] == '#': + continue + elif len(row) != 3: + print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) + continue + id, field, value = row + annotations[id][field] = value.partition('#')[0].rstrip() + + for record in stdin: + record = json.loads(record) + + record_id = record.get(args.id_field) + if record_id is None: + print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) + exit(1) + + record.update(annotations.get(record_id, {})) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/notify-on-job-fail b/ingest/vendored/notify-on-job-fail new file mode 100755 index 00000000..02cb6bad --- /dev/null +++ b/ingest/vendored/notify-on-job-fail @@ -0,0 +1,23 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" + +echo "Notifying Slack about failed ${job_name} job." +message="❌ ${job_name} job has FAILED 😞 " + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " +elif [[ -n "${GITHUB_RUN_ID}" ]]; then + message+="See GitHub Action for error details. " +fi + +"$bin"/notify-slack "$message" diff --git a/ingest/vendored/notify-on-job-start b/ingest/vendored/notify-on-job-start new file mode 100755 index 00000000..3e44bb09 --- /dev/null +++ b/ingest/vendored/notify-on-job-start @@ -0,0 +1,27 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" +build_dir="${3:-ingest}" + +echo "Notifying Slack about started ${job_name} job." +message="${job_name} job has started." + +if [[ -n "${GITHUB_RUN_ID}" ]]; then + message+=" The job was submitted by GitHub Action ." +fi + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." 
+ message+=" Follow along in your local clone of ${github_repo} with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ${build_dir}"'```' +fi + +"$bin"/notify-slack "$message" diff --git a/ingest/vendored/notify-slack b/ingest/vendored/notify-slack new file mode 100755 index 00000000..db98bfb8 --- /dev/null +++ b/ingest/vendored/notify-slack @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +upload=0 +output=/dev/null +thread_ts="" +broadcast=0 +args=() + +for arg; do + case "$arg" in + --upload) + upload=1;; + --output=*) + output="${arg#*=}";; + --thread-ts=*) + thread_ts="${arg#*=}";; + --broadcast) + broadcast=1;; + *) + args+=("$arg");; + esac +done + +set -- "${args[@]}" + +text="${1:?Some message text is required.}" + +if [[ "$upload" == 1 ]]; then + echo "Uploading data to Slack with the message: $text" + curl https://slack.com/api/files.upload \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channels="$SLACK_CHANNELS" \ + --form-string title="$text" \ + --form-string filename="$text" \ + --form-string thread_ts="$thread_ts" \ + --form file=@/dev/stdin \ + --form filetype=text \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +else + echo "Posting Slack message: $text" + curl https://slack.com/api/chat.postMessage \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channel="$SLACK_CHANNELS" \ + --form-string text="$text" \ + --form-string thread_ts="$thread_ts" \ + --form-string reply_broadcast="$broadcast" \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +fi diff --git a/ingest/vendored/s3-object-exists b/ingest/vendored/s3-object-exists new file mode 100755 index 00000000..faac4219 --- /dev/null +++ b/ingest/vendored/s3-object-exists @@ -0,0 +1,8 @@ +#!/bin/bash +set -euo pipefail + +url="${1#s3://}" +bucket="${url%%/*}" +key="${url#*/}" + +aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/ingest/vendored/sha256sum b/ingest/vendored/sha256sum new file mode 100755 index 00000000..32d7ef82 --- /dev/null +++ b/ingest/vendored/sha256sum @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +Portable sha256sum utility. +""" +from hashlib import sha256 +from sys import stdin + +chunk_size = 5 * 1024**2 # 5 MiB + +h = sha256() + +for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): + h.update(chunk) + +print(h.hexdigest()) diff --git a/ingest/vendored/transform-authors b/ingest/vendored/transform-authors new file mode 100755 index 00000000..0bade20e --- /dev/null +++ b/ingest/vendored/transform-authors @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +Abbreviates a full list of authors to be ' et al.' of the NDJSON +record from stdin and outputs modified records to stdout. + +Note: This is a "best effort" approach and can potentially mangle the author name. 
+""" +import argparse +import json +import re +from sys import stderr, stdin, stdout + + +def parse_authors(record: dict, authors_field: str, default_value: str, + index: int, abbr_authors_field: str = None) -> dict: + # Strip and normalize whitespace + new_authors = re.sub(r'\s+', ' ', record[authors_field]) + + if new_authors == "": + new_authors = default_value + else: + # Split authors list on comma/semicolon + # OR "and"/"&" with at least one space before and after + new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] + + # if it does not already end with " et al.", add it + if not new_authors.strip('. ').endswith(" et al"): + new_authors += ' et al' + + if abbr_authors_field: + if record.get(abbr_authors_field): + print( + f"WARNING: the {abbr_authors_field!r} field already exists", + f"in record {index} and will be overwritten!", + file=stderr + ) + + record[abbr_authors_field] = new_authors + else: + record[authors_field] = new_authors + + return record + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--authors-field", default="authors", + help="The field containing list of authors.") + parser.add_argument("--default-value", default="?", + help="Default value to use if authors list is empty.") + parser.add_argument("--abbr-authors-field", + help="The field for the generated abbreviated authors. " + + "If not provided, the original authors field will be modified.") + + args = parser.parse_args() + + for index, record in enumerate(stdin): + record = json.loads(record) + + parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/transform-field-names b/ingest/vendored/transform-field-names new file mode 100755 index 00000000..fde223fc --- /dev/null +++ b/ingest/vendored/transform-field-names @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Renames fields of the NDJSON record from stdin and outputs modified records +to stdout. +""" +import argparse +import json +from sys import stderr, stdin, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--field-map", nargs="+", + help="Fields names in the NDJSON record mapped to new field names, " + + "formatted as '{old_field_name}={new_field_name}'. " + + "If the old field does not exist in record, the new field will be added with an empty string value." + + "If the new field already exists in record, then the renaming of the old field will be skipped.") + parser.add_argument("--force", action="store_true", + help="Force renaming of old field even if the new field already exists. 
" + + "Please keep in mind this will overwrite the value of the new field.") + + args = parser.parse_args() + + field_map = {} + for field in args.field_map: + old_name, new_name = field.split('=') + field_map[old_name] = new_name + + for record in stdin: + record = json.loads(record) + + for old_field, new_field in field_map.items(): + + if record.get(new_field) and not args.force: + print( + f"WARNING: skipping rename of {old_field} because record", + f"already has a field named {new_field}.", + file=stderr + ) + continue + + record[new_field] = record.pop(old_field, '') + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/transform-genbank-location b/ingest/vendored/transform-genbank-location new file mode 100755 index 00000000..70ba56fb --- /dev/null +++ b/ingest/vendored/transform-genbank-location @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate +fields: 'country', 'division', and 'location'. Checks that a record is from +GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". + +Outputs the modified record to stdout. +""" +import json +from sys import stdin, stdout + + +def parse_location(record: dict) -> dict: + # Expected pattern for the location field is "[:][, ]" + # See GenBank docs for their "country" field: + # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ + geographic_data = record['location'].split(':') + + country = geographic_data[0] + division = '' + location = '' + + if len(geographic_data) == 2: + division , _ , location = geographic_data[1].partition(',') + + record['country'] = country.strip() + record['division'] = division.strip() + record['location'] = location.strip() + + return record + + +if __name__ == '__main__': + + for record in stdin: + record = json.loads(record) + + database = record.get('database', '') + if database in {'GenBank', 'RefSeq'}: + parse_location(record) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/trigger b/ingest/vendored/trigger new file mode 100755 index 00000000..d40553b6 --- /dev/null +++ b/ingest/vendored/trigger @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:=}" + +repo="${1:?A repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +shift 2 + +if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then + cat >&2 <<. +You must specify options to curl for your GitHub credentials. For example, you +can specify your GitHub username, and will be prompted for your password: + + $0 $repo $event_type --user + +Be sure to enter a personal access token¹ as your password since GitHub has +discontinued password authentication to the API starting on November 13, 2020². + +You can also store your credentials or a personal access token in a netrc +file³: + + machine api.github.com + login + password + +and then tell curl to use it: + + $0 $repo $event_type --netrc + +which will then not require you to type your password every time. + +¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line +² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password +³ https://ec.haxx.se/usingcurl/usingcurl-netrc +. 
+ exit 1 +fi + +auth=':' +if [[ -n $PAT_GITHUB_DISPATCH ]]; then + auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" +fi + +if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \ + -H 'Accept: application/vnd.github.v3+json' \ + -H 'Content-Type: application/json' \ + -H "$auth" \ + -d '{"event_type":"'"$event_type"'"}' \ + "$@" +then + echo "Successfully triggered $event_type" +else + echo "Request failed" >&2 + exit 1 +fi From 9104b11b020fbb5d6a4bdf7174548117e2f7eebb Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 13:39:39 -0700 Subject: [PATCH 2/5] Describe subrepo setup The previous commit was created by the following command: git subrepo clone https://github.com/nextstrain/ingest ingest/vendored Add a section in the README on how to use this directory in the future. --- ingest/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ingest/README.md b/ingest/README.md index a8911985..92bed530 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -85,3 +85,16 @@ These are optional environment variables used in our automated pipeline for prov GenBank sequences and metadata are fetched via NCBI Virus. The exact URL used to fetch data is constructed in `bin/genbank-url`. + +## `ingest/vendored` + +This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies of ingest scripts in `ingest/vendored`, from [nextstrain/ingest](https://github.com/nextstrain/ingest). To pull new changes from the central ingest repository, first install `git subrepo`, then run: + +```sh +git subrepo pull ingest/vendored +``` + +Changes should not be pushed using `git subrepo push`. + +1. For pathogen-specific changes, make them in this repository via a pull request. +2. For pathogen-agnostic changes, make them on [nextstrain/ingest](https://github.com/nextstrain/ingest) via pull request there, then use `git subrepo pull` to add those changes to this repository. From 29ddb20b49293239ec28bffb6a91cff7d06b6fbf Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 13:42:48 -0700 Subject: [PATCH 3/5] Use centralized scripts that are functionally identical Remove the copies in this repo and update references. 
--- .github/workflows/rebuild-all.yaml | 2 +- ingest/bin/cloudfront-invalidate | 42 ------------ ingest/bin/download-from-s3 | 4 +- ingest/bin/merge-user-metadata | 55 ---------------- ingest/bin/notify-on-diff | 3 +- ingest/bin/notify-on-record-change | 3 +- ingest/bin/s3-object-exists | 9 --- ingest/bin/sha256sum | 16 ----- ingest/bin/transform-authors | 66 ------------------- ingest/bin/transform-field-names | 48 -------------- ingest/bin/transform-genbank-location | 43 ------------ ingest/bin/trigger | 56 ---------------- ingest/bin/trigger-on-new-data | 3 +- ingest/bin/upload-to-s3 | 5 +- ingest/workflow/snakemake_rules/transform.smk | 8 +-- 15 files changed, 16 insertions(+), 347 deletions(-) delete mode 100755 ingest/bin/cloudfront-invalidate delete mode 100755 ingest/bin/merge-user-metadata delete mode 100755 ingest/bin/s3-object-exists delete mode 100755 ingest/bin/sha256sum delete mode 100755 ingest/bin/transform-authors delete mode 100755 ingest/bin/transform-field-names delete mode 100755 ingest/bin/transform-genbank-location delete mode 100755 ingest/bin/trigger diff --git a/.github/workflows/rebuild-all.yaml b/.github/workflows/rebuild-all.yaml index cf7984b8..b687e153 100644 --- a/.github/workflows/rebuild-all.yaml +++ b/.github/workflows/rebuild-all.yaml @@ -9,6 +9,6 @@ jobs: steps: - uses: actions/checkout@v3 - name: Repository Dispatch - run: ./ingest/bin/trigger monkeypox rebuild + run: ./ingest/vendored/trigger monkeypox rebuild env: PAT_GITHUB_DISPATCH: ${{ secrets.GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH }} diff --git a/ingest/bin/cloudfront-invalidate b/ingest/bin/cloudfront-invalidate deleted file mode 100755 index dec48529..00000000 --- a/ingest/bin/cloudfront-invalidate +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 -set -euo pipefail - -main() { - local domain="$1" - shift - local paths=("$@") - local distribution invalidation - - echo "-> Finding CloudFront distribution" - distribution=$( - aws cloudfront list-distributions \ - --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ - --output text - ) - - if [[ -z $distribution || $distribution == None ]]; then - exec >&2 - echo "Unable to find CloudFront distribution id for $domain" - echo - echo "Are your AWS CLI credentials for the right account?" - exit 1 - fi - - echo "-> Creating CloudFront invalidation for distribution $distribution" - invalidation=$( - aws cloudfront create-invalidation \ - --distribution-id "$distribution" \ - --paths "${paths[@]}" \ - --query Invalidation.Id \ - --output text - ) - - echo "-> Waiting for CloudFront invalidation $invalidation to complete" - echo " Ctrl-C to stop waiting." 
- aws cloudfront wait invalidation-completed \ - --distribution-id "$distribution" \ - --id "$invalidation" -} - -main "$@" diff --git a/ingest/bin/download-from-s3 b/ingest/bin/download-from-s3 index 762fe581..99424c77 100755 --- a/ingest/bin/download-from-s3 +++ b/ingest/bin/download-from-s3 @@ -2,7 +2,7 @@ # Originally copied from nextstrain/ncov-ingest repo set -euo pipefail -bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored main() { local src="${1:?A source s3:// URL is required as the first argument.}" @@ -13,7 +13,7 @@ main() { local key="${s3path#*/}" local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - dst_hash="$("$bin/sha256sum" < "$dst" || true)" + dst_hash="$("$vendored/sha256sum" < "$dst" || true)" src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" echo "[ INFO] Downloading $src → $dst" diff --git a/ingest/bin/merge-user-metadata b/ingest/bin/merge-user-metadata deleted file mode 100755 index 341c2dfa..00000000 --- a/ingest/bin/merge-user-metadata +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -""" -Merges user curated annotations with the NDJSON records from stdin, with the user -curations overwriting the existing fields. The modified records are output -to stdout. This does not do any additional transformations on top of the user -curations. -""" -import argparse -import csv -import json -from collections import defaultdict -from sys import exit, stdin, stderr, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--annotations", metavar="TSV", required=True, - help="Manually curated annotations TSV file. " + - "The TSV should not have a header and should have exactly three columns: " + - "id to match existing metadata, field name, and field value. " + - "If there are multiple annotations for the same id and field, then the last value is used. " + - "Lines starting with '#' are treated as comments. 
" + - "Any '#' after the field value are treated as comments.") - parser.add_argument("--id-field", default="accession", - help="The ID field in the metadata to use to merge with the annotations.") - - args = parser.parse_args() - - annotations = defaultdict(dict) - with open(args.annotations, 'r') as annotations_fh: - csv_reader = csv.reader(annotations_fh, delimiter='\t') - for row in csv_reader: - if not row or row[0].lstrip()[0] == '#': - continue - elif len(row) != 3: - print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) - continue - id, field, value = row - annotations[id][field] = value.partition('#')[0].rstrip() - - for record in stdin: - record = json.loads(record) - - record_id = record.get(args.id_field) - if record_id is None: - print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) - exit(1) - - record.update(annotations.get(record_id, {})) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/notify-on-diff b/ingest/bin/notify-on-diff index c304d6b5..32464801 100755 --- a/ingest/bin/notify-on-diff +++ b/ingest/bin/notify-on-diff @@ -6,6 +6,7 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source file is required as the first argument.}" dst="${2:?A destination s3:// URL is required as the second argument.}" @@ -16,7 +17,7 @@ diff="$(mktemp -t diff-XXXXXX)" trap "rm -f '$dst_local' '$diff'" EXIT # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 "$bin"/download-from-s3 "$dst" "$dst_local" diff --git a/ingest/bin/notify-on-record-change b/ingest/bin/notify-on-record-change index 595835b5..6487fbcb 100755 --- a/ingest/bin/notify-on-record-change +++ b/ingest/bin/notify-on-record-change @@ -6,13 +6,14 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source ndjson file is required as the first argument.}" dst="${2:?A destination ndjson s3:// URL is required as the second argument.}" source_name=${3:?A record source name is required as the third argument.} # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 s3path="${dst#s3://}" bucket="${s3path%%/*}" diff --git a/ingest/bin/s3-object-exists b/ingest/bin/s3-object-exists deleted file mode 100755 index d586d0b8..00000000 --- a/ingest/bin/s3-object-exists +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest -set -euo pipefail - -url="${1#s3://}" -bucket="${url%%/*}" -key="${url#*/}" - -aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/ingest/bin/sha256sum b/ingest/bin/sha256sum deleted file mode 100755 index aa05af00..00000000 --- a/ingest/bin/sha256sum +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 -# Originally copied from nextstrain/ncov-ingest repo -""" -Portable sha256sum utility. 
-""" -from hashlib import sha256 -from sys import stdin - -chunk_size = 5 * 1024**2 # 5 MiB - -h = sha256() - -for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): - h.update(chunk) - -print(h.hexdigest()) diff --git a/ingest/bin/transform-authors b/ingest/bin/transform-authors deleted file mode 100755 index 0bade20e..00000000 --- a/ingest/bin/transform-authors +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -""" -Abbreviates a full list of authors to be ' et al.' of the NDJSON -record from stdin and outputs modified records to stdout. - -Note: This is a "best effort" approach and can potentially mangle the author name. -""" -import argparse -import json -import re -from sys import stderr, stdin, stdout - - -def parse_authors(record: dict, authors_field: str, default_value: str, - index: int, abbr_authors_field: str = None) -> dict: - # Strip and normalize whitespace - new_authors = re.sub(r'\s+', ' ', record[authors_field]) - - if new_authors == "": - new_authors = default_value - else: - # Split authors list on comma/semicolon - # OR "and"/"&" with at least one space before and after - new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] - - # if it does not already end with " et al.", add it - if not new_authors.strip('. ').endswith(" et al"): - new_authors += ' et al' - - if abbr_authors_field: - if record.get(abbr_authors_field): - print( - f"WARNING: the {abbr_authors_field!r} field already exists", - f"in record {index} and will be overwritten!", - file=stderr - ) - - record[abbr_authors_field] = new_authors - else: - record[authors_field] = new_authors - - return record - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--authors-field", default="authors", - help="The field containing list of authors.") - parser.add_argument("--default-value", default="?", - help="Default value to use if authors list is empty.") - parser.add_argument("--abbr-authors-field", - help="The field for the generated abbreviated authors. " + - "If not provided, the original authors field will be modified.") - - args = parser.parse_args() - - for index, record in enumerate(stdin): - record = json.loads(record) - - parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/transform-field-names b/ingest/bin/transform-field-names deleted file mode 100755 index fde223fc..00000000 --- a/ingest/bin/transform-field-names +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Renames fields of the NDJSON record from stdin and outputs modified records -to stdout. -""" -import argparse -import json -from sys import stderr, stdin, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--field-map", nargs="+", - help="Fields names in the NDJSON record mapped to new field names, " + - "formatted as '{old_field_name}={new_field_name}'. " + - "If the old field does not exist in record, the new field will be added with an empty string value." + - "If the new field already exists in record, then the renaming of the old field will be skipped.") - parser.add_argument("--force", action="store_true", - help="Force renaming of old field even if the new field already exists. 
" + - "Please keep in mind this will overwrite the value of the new field.") - - args = parser.parse_args() - - field_map = {} - for field in args.field_map: - old_name, new_name = field.split('=') - field_map[old_name] = new_name - - for record in stdin: - record = json.loads(record) - - for old_field, new_field in field_map.items(): - - if record.get(new_field) and not args.force: - print( - f"WARNING: skipping rename of {old_field} because record", - f"already has a field named {new_field}.", - file=stderr - ) - continue - - record[new_field] = record.pop(old_field, '') - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/transform-genbank-location b/ingest/bin/transform-genbank-location deleted file mode 100755 index 70ba56fb..00000000 --- a/ingest/bin/transform-genbank-location +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -""" -Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate -fields: 'country', 'division', and 'location'. Checks that a record is from -GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". - -Outputs the modified record to stdout. -""" -import json -from sys import stdin, stdout - - -def parse_location(record: dict) -> dict: - # Expected pattern for the location field is "[:][, ]" - # See GenBank docs for their "country" field: - # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ - geographic_data = record['location'].split(':') - - country = geographic_data[0] - division = '' - location = '' - - if len(geographic_data) == 2: - division , _ , location = geographic_data[1].partition(',') - - record['country'] = country.strip() - record['division'] = division.strip() - record['location'] = location.strip() - - return record - - -if __name__ == '__main__': - - for record in stdin: - record = json.loads(record) - - database = record.get('database', '') - if database in {'GenBank', 'RefSeq'}: - parse_location(record) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/trigger b/ingest/bin/trigger deleted file mode 100755 index d40553b6..00000000 --- a/ingest/bin/trigger +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${PAT_GITHUB_DISPATCH:=}" - -repo="${1:?A repository name is required as the first argument.}" -event_type="${2:?An event type is required as the second argument.}" -shift 2 - -if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then - cat >&2 <<. -You must specify options to curl for your GitHub credentials. For example, you -can specify your GitHub username, and will be prompted for your password: - - $0 $repo $event_type --user - -Be sure to enter a personal access token¹ as your password since GitHub has -discontinued password authentication to the API starting on November 13, 2020². - -You can also store your credentials or a personal access token in a netrc -file³: - - machine api.github.com - login - password - -and then tell curl to use it: - - $0 $repo $event_type --netrc - -which will then not require you to type your password every time. - -¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line -² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password -³ https://ec.haxx.se/usingcurl/usingcurl-netrc -. 
- exit 1 -fi - -auth=':' -if [[ -n $PAT_GITHUB_DISPATCH ]]; then - auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" -fi - -if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \ - -H 'Accept: application/vnd.github.v3+json' \ - -H 'Content-Type: application/json' \ - -H "$auth" \ - -d '{"event_type":"'"$event_type"'"}' \ - "$@" -then - echo "Successfully triggered $event_type" -else - echo "Request failed" >&2 - exit 1 -fi diff --git a/ingest/bin/trigger-on-new-data b/ingest/bin/trigger-on-new-data index 760a0187..86f40e2f 100755 --- a/ingest/bin/trigger-on-new-data +++ b/ingest/bin/trigger-on-new-data @@ -4,6 +4,7 @@ set -euo pipefail : "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored metadata="${1:?A metadata upload output file is required as the first argument.}" sequences="${2:?An sequence FASTA upload output file is required as the second argument.}" @@ -17,7 +18,7 @@ slack_message="" # grep exit status 0 for found match, 1 for no match, 2 if an error occurred if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then slack_message="Triggering new builds due to updated metadata and/or sequences" - "$bin"/trigger "monkeypox" "rebuild" + "$vendored"/trigger "monkeypox" "rebuild" elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files." else diff --git a/ingest/bin/upload-to-s3 b/ingest/bin/upload-to-s3 index 78b35edd..2c0394f8 100755 --- a/ingest/bin/upload-to-s3 +++ b/ingest/bin/upload-to-s3 @@ -3,6 +3,7 @@ set -euo pipefail bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored main() { local quiet=0 @@ -26,7 +27,7 @@ main() { local key="${s3path#*/}" local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - src_hash="$("$bin/sha256sum" < "$src")" + src_hash="$("$vendored/sha256sum" < "$src")" dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" if [[ $src_hash != "$dst_hash" ]]; then @@ -46,7 +47,7 @@ main() { if [[ -n $cloudfront_domain ]]; then echo "Creating CloudFront invalidation for $cloudfront_domain/$key" - if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then + if ! "$vendored"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then echo "CloudFront invalidation failed, but exiting with success anyway." 
fi fi diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk index b08fc816..1a1726e2 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/transform.smk @@ -65,7 +65,7 @@ rule transform: shell: """ (cat {input.sequences_ndjson} \ - | ./bin/transform-field-names \ + | ./vendored/transform-field-names \ --field-map {params.field_map} \ | augur curate normalize-strings \ | ./bin/transform-strain-names \ @@ -74,18 +74,18 @@ rule transform: | ./bin/transform-date-fields \ --date-fields {params.date_fields} \ --expected-date-formats {params.expected_date_formats} \ - | ./bin/transform-genbank-location \ + | ./vendored/transform-genbank-location \ | ./bin/transform-string-fields \ --titlecase-fields {params.titlecase_fields} \ --articles {params.articles} \ --abbreviations {params.abbreviations} \ - | ./bin/transform-authors \ + | ./vendored/transform-authors \ --authors-field {params.authors_field} \ --default-value {params.authors_default_value} \ --abbr-authors-field {params.abbr_authors_field} \ | ./bin/apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ - | ./bin/merge-user-metadata \ + | ./vendored/merge-user-metadata \ --annotations {input.annotations} \ --id-field {params.annotations_id} \ | ./bin/ndjson-to-tsv-and-fasta \ From de3f6e939867618bc0e999ebca2587eaaac21582 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 13:50:15 -0700 Subject: [PATCH 4/5] Use centralized Slack notification scripts Remove the copies in this repo and update references. Add new positional arguments required by the centralized scripts. --- .github/workflows/fetch-and-ingest.yaml | 2 +- bin/notify-on-deploy | 4 +- bin/notify-on-error | 4 +- bin/notify-on-start | 4 +- bin/notify-on-success | 4 +- ingest/bin/notify-on-diff | 4 +- ingest/bin/notify-on-job-fail | 21 ------- ingest/bin/notify-on-job-start | 24 -------- ingest/bin/notify-on-record-change | 2 +- ingest/bin/notify-slack | 58 ------------------- ingest/bin/trigger-on-new-data | 2 +- ingest/bin/upload-to-s3 | 2 +- .../snakemake_rules/slack_notifications.smk | 4 +- 13 files changed, 16 insertions(+), 119 deletions(-) delete mode 100755 ingest/bin/notify-on-job-fail delete mode 100755 ingest/bin/notify-on-job-start delete mode 100755 ingest/bin/notify-slack diff --git a/.github/workflows/fetch-and-ingest.yaml b/.github/workflows/fetch-and-ingest.yaml index 22cbf667..5c789677 100644 --- a/.github/workflows/fetch-and-ingest.yaml +++ b/.github/workflows/fetch-and-ingest.yaml @@ -73,4 +73,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./ingest/bin/notify-on-job-fail + run: ./ingest/vendored/notify-on-job-fail Ingest nextstrain/monkeypox diff --git a/bin/notify-on-deploy b/bin/notify-on-deploy index 77763e74..e279a340 100755 --- a/bin/notify-on-deploy +++ b/bin/notify-on-deploy @@ -5,11 +5,11 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" deployment_url="${1:?A deployment url is required as the first argument.}" slack_ts_file="${2:?A Slack thread timestamp file is required as the second argument.}" echo "Notifying Slack about deployed builds." 
-"$ingest_bin"/notify-slack "Deployed this build to $deployment_url" \ +"$ingest_vendored"/notify-slack "Deployed this build to $deployment_url" \ --thread-ts="$(cat "$slack_ts_file")" diff --git a/bin/notify-on-error b/bin/notify-on-error index bbf82a91..810ce8b1 100755 --- a/bin/notify-on-error +++ b/bin/notify-on-error @@ -8,7 +8,7 @@ set -euo pipefail : "${GITHUB_RUN_ID:=}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" slack_ts_file="${1:-}" @@ -26,6 +26,6 @@ elif [[ -n "${GITHUB_RUN_ID}" ]]; then message+="See GitHub Action for error details." fi -"$ingest_bin"/notify-slack "$message" \ +"$ingest_vendored"/notify-slack "$message" \ --thread-ts="$thread_ts" \ --broadcast diff --git a/bin/notify-on-start b/bin/notify-on-start index 6042e060..c0695ea4 100755 --- a/bin/notify-on-start +++ b/bin/notify-on-start @@ -8,7 +8,7 @@ set -euo pipefail : "${GITHUB_RUN_ID:=}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" build_name="${1:?A build name is required as the first argument.}" slack_ts_output="${2:?A Slack thread timestamp file is required as the second argument}" @@ -29,7 +29,7 @@ if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then message+=" Follow along in your local \`monkeypox\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} . "'```' fi -"$ingest_bin"/notify-slack "$message" --output="$slack_response" +"$ingest_vendored"/notify-slack "$message" --output="$slack_response" echo "Saving Slack thread timestamp to '$slack_ts_output'." diff --git a/bin/notify-on-success b/bin/notify-on-success index f4b21c37..69b85c8f 100755 --- a/bin/notify-on-success +++ b/bin/notify-on-success @@ -5,10 +5,10 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" slack_ts_file="${1:?A Slack thread timestamp file is required as the first argument.}" echo "Notifying Slack about successful build." -"$ingest_bin"/notify-slack "✅ This pipeline has successfully finished 🎉" \ +"$ingest_vendored"/notify-slack "✅ This pipeline has successfully finished 🎉" \ --thread-ts="$(cat "$slack_ts_file")" diff --git a/ingest/bin/notify-on-diff b/ingest/bin/notify-on-diff index 32464801..e401f124 100755 --- a/ingest/bin/notify-on-diff +++ b/ingest/bin/notify-on-diff @@ -27,10 +27,10 @@ diff "$dst_local" "$src" > "$diff" || diff_exit_code=$? if [[ "$diff_exit_code" -eq 1 ]]; then echo "Notifying Slack about diff." - "$bin"/notify-slack --upload "$src.diff" < "$diff" + "$vendored"/notify-slack --upload "$src.diff" < "$diff" elif [[ "$diff_exit_code" -gt 1 ]]; then echo "Notifying Slack about diff failure" - "$bin"/notify-slack "Diff failed for $src" + "$vendored"/notify-slack "Diff failed for $src" else echo "No change in $src." fi diff --git a/ingest/bin/notify-on-job-fail b/ingest/bin/notify-on-job-fail deleted file mode 100755 index 23d3a926..00000000 --- a/ingest/bin/notify-on-job-fail +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about failed ingest job." 
-message="❌ Ingest job has FAILED 😞 " - -if [ -n "${AWS_BATCH_JOB_ID}" ]; then - message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " -elif [ -n "${GITHUB_RUN_ID}" ]; then - message+="See GitHub Action for error details. " -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/bin/notify-on-job-start b/ingest/bin/notify-on-job-start deleted file mode 100755 index 9410fa38..00000000 --- a/ingest/bin/notify-on-job-start +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about started ingest job." -message="🐵 Monkeypox ingest job has started." - -if [[ -n "${GITHUB_RUN_ID}" ]]; then - message+=" The job was submitted by GitHub Action ." -fi - -if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then - message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." - message+=" Follow along in your local \`monkeypox\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ingest/"'```' -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/bin/notify-on-record-change b/ingest/bin/notify-on-record-change index 6487fbcb..3987a192 100755 --- a/ingest/bin/notify-on-record-change +++ b/ingest/bin/notify-on-record-change @@ -52,4 +52,4 @@ fi slack_message+=" (Total record count: $src_record_count)" -"$bin"/notify-slack "$slack_message" +"$vendored"/notify-slack "$slack_message" diff --git a/ingest/bin/notify-slack b/ingest/bin/notify-slack deleted file mode 100755 index 6ca20dec..00000000 --- a/ingest/bin/notify-slack +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest repo -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -upload=0 -output=/dev/null -thread_ts="" -broadcast=0 -args=() - -for arg; do - case "$arg" in - --upload) - upload=1;; - --output=*) - output="${arg#*=}";; - --thread-ts=*) - thread_ts="${arg#*=}";; - --broadcast) - broadcast=1;; - *) - args+=("$arg");; - esac -done - -set -- "${args[@]}" - -text="${1:?Some message text is required.}" - -if [[ "$upload" == 1 ]]; then - echo "Uploading data to Slack with the message: $text" - curl https://slack.com/api/files.upload \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channels="$SLACK_CHANNELS" \ - --form-string title="$text" \ - --form-string filename="$text" \ - --form-string thread_ts="$thread_ts" \ - --form-string reply_broadcast="$broadcast" \ - --form file=@/dev/stdin \ - --form filetype=text \ - --fail --silent --show-error \ - --http1.1 \ - --output "$output" -else - echo "Posting Slack message: $text" - curl https://slack.com/api/chat.postMessage \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channel="$SLACK_CHANNELS" \ - --form-string text="$text" \ - --form-string thread_ts="$thread_ts" \ - --form-string reply_broadcast="$broadcast" \ - --fail --silent --show-error \ - --http1.1 \ - --output "$output" -fi diff --git a/ingest/bin/trigger-on-new-data b/ingest/bin/trigger-on-new-data index 86f40e2f..97c85ff7 100755 --- a/ingest/bin/trigger-on-new-data +++ b/ingest/bin/trigger-on-new-data @@ -26,6 +26,6 @@ else fi -if ! "$bin"/notify-slack "$slack_message"; then +if ! 
"$vendored"/notify-slack "$slack_message"; then echo "Notifying Slack failed, but exiting with success anyway." fi diff --git a/ingest/bin/upload-to-s3 b/ingest/bin/upload-to-s3 index 2c0394f8..cc867755 100755 --- a/ingest/bin/upload-to-s3 +++ b/ingest/bin/upload-to-s3 @@ -57,7 +57,7 @@ main() { exit 0 fi - if ! "$bin"/notify-slack "Updated $dst available."; then + if ! "$vendored"/notify-slack "Updated $dst available."; then echo "Notifying Slack failed, but exiting with success anyway." fi else diff --git a/ingest/workflow/snakemake_rules/slack_notifications.smk b/ingest/workflow/snakemake_rules/slack_notifications.smk index b4a54753..d19b848b 100644 --- a/ingest/workflow/snakemake_rules/slack_notifications.smk +++ b/ingest/workflow/snakemake_rules/slack_notifications.smk @@ -48,8 +48,8 @@ rule notify_on_metadata_diff: onstart: - shell("./bin/notify-on-job-start") + shell("./vendored/notify-on-job-start Ingest nextstrain/monkeypox") onerror: - shell("./bin/notify-on-job-fail") + shell("./vendored/notify-on-job-fail Ingest nextstrain/monkeypox") From 61a69be34b14ee9b246cea5dd53442378f640dd3 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 14:37:56 -0700 Subject: [PATCH 5/5] Remove unused bin variable --- ingest/bin/trigger-on-new-data | 1 - ingest/bin/upload-to-s3 | 1 - 2 files changed, 2 deletions(-) diff --git a/ingest/bin/trigger-on-new-data b/ingest/bin/trigger-on-new-data index 97c85ff7..1f3e0bac 100755 --- a/ingest/bin/trigger-on-new-data +++ b/ingest/bin/trigger-on-new-data @@ -3,7 +3,6 @@ set -euo pipefail : "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored metadata="${1:?A metadata upload output file is required as the first argument.}" diff --git a/ingest/bin/upload-to-s3 b/ingest/bin/upload-to-s3 index cc867755..d913182f 100755 --- a/ingest/bin/upload-to-s3 +++ b/ingest/bin/upload-to-s3 @@ -2,7 +2,6 @@ # Originally copied from nextstrain/ncov-ingest repo set -euo pipefail -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored main() {