From a2b3f60e299458e38cb3236507c0f914a412bd79 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:30:28 -0700 Subject: [PATCH 1/5] git subrepo clone (merge) https://github.com/nextstrain/ingest ingest/vendored subrepo: subdir: "ingest/vendored" merged: "9082700" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "9082700" git-subrepo: version: "0.4.6" origin: "https://github.com/ingydotnet/git-subrepo" commit: "110b9eb" --- ingest/vendored/.github/workflows/ci.yaml | 13 +++++ ingest/vendored/.gitrepo | 12 ++++ ingest/vendored/README.md | 60 ++++++++++++++++++++ ingest/vendored/cloudfront-invalidate | 42 ++++++++++++++ ingest/vendored/merge-user-metadata | 55 ++++++++++++++++++ ingest/vendored/notify-on-job-fail | 23 ++++++++ ingest/vendored/notify-on-job-start | 27 +++++++++ ingest/vendored/notify-slack | 56 ++++++++++++++++++ ingest/vendored/s3-object-exists | 8 +++ ingest/vendored/sha256sum | 15 +++++ ingest/vendored/transform-authors | 66 ++++++++++++++++++++++ ingest/vendored/transform-field-names | 48 ++++++++++++++++ ingest/vendored/transform-genbank-location | 43 ++++++++++++++ ingest/vendored/trigger | 56 ++++++++++++++++++ 14 files changed, 524 insertions(+) create mode 100644 ingest/vendored/.github/workflows/ci.yaml create mode 100644 ingest/vendored/.gitrepo create mode 100644 ingest/vendored/README.md create mode 100755 ingest/vendored/cloudfront-invalidate create mode 100755 ingest/vendored/merge-user-metadata create mode 100755 ingest/vendored/notify-on-job-fail create mode 100755 ingest/vendored/notify-on-job-start create mode 100755 ingest/vendored/notify-slack create mode 100755 ingest/vendored/s3-object-exists create mode 100755 ingest/vendored/sha256sum create mode 100755 ingest/vendored/transform-authors create mode 100755 ingest/vendored/transform-field-names create mode 100755 ingest/vendored/transform-genbank-location create mode 100755 ingest/vendored/trigger diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml new file mode 100644 index 00000000..dcb3b898 --- /dev/null +++ b/ingest/vendored/.github/workflows/ci.yaml @@ -0,0 +1,13 @@ +name: CI + +on: + - push + - pull_request + - workflow_dispatch + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: nextstrain/.github/actions/shellcheck@master diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo new file mode 100644 index 00000000..614ef293 --- /dev/null +++ b/ingest/vendored/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/nextstrain/ingest + branch = main + commit = 9082700fdbb99007d5e852e657cedf3723d13181 + parent = 5c461dc7e90cd70c1f16b193f82fd1666d4c95e2 + method = merge + cmdver = 0.4.6 diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md new file mode 100644 index 00000000..84b855c1 --- /dev/null +++ b/ingest/vendored/README.md @@ -0,0 +1,60 @@ +# ingest + +Shared internal tooling for pathogen data ingest. Used by our individual +pathogen repos which produce Nextstrain builds. Expected to be vendored by +each pathogen repo using `git subtree` (or `git subrepo`). + +Some tools may only live here temporarily before finding a permanent home in +`augur curate` or Nextstrain CLI. 
Others may happily live out their days here. + +## History + +Much of this tooling originated in +[ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru +[monkeypox's ingest/](https://github.com/nextstrain/monkeypox/tree/@/ingest/). +It subsequently proliferated from [monkeypox][] to other pathogen repos +([rsv][], [zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily +thru copying. To [counter that +proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), +this repo was made. + +[monkeypox]: https://github.com/nextstrain/monkeypox +[rsv]: https://github.com/nextstrain/rsv +[zika]: https://github.com/nextstrain/zika/pull/24 +[dengue]: https://github.com/nextstrain/dengue/pull/10 +[hepatitisB]: https://github.com/nextstrain/hepatitisB +[forecasts-ncov]: https://github.com/nextstrain/forecasts-ncov + +## Elsewhere + +The creation of this repo, in both the abstract and concrete, and the general +approach to "ingest" has been discussed in various internal places, including: + +- https://github.com/nextstrain/private/issues/59 +- @joverlee521's [workflows document](https://docs.google.com/document/d/1rLWPvEuj0Ayc8MR0O1lfRJZfj9av53xU38f20g8nU_E/edit#heading=h.4g0d3mjvb89i) +- [5 July 2023 Slack thread](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079) +- [6 July 2023 team meeting](https://docs.google.com/document/d/1FPfx-ON5RdqL2wyvODhkrCcjgOVX3nlXgBwCPhIEsco/edit) +- _…many others_ + +## Scripts + +Scripts for supporting ingest workflow automation that don’t really belong in any of our existing tools. + +- [notify-on-job-fail](notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch +- [notify-on-job-start](notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch +- [notify-slack](notify-slack) - Send message or file to Slack +- [s3-object-exists](s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts +- [trigger](trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events. + +Potential Nextstrain CLI scripts + +- [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. +- [cloudfront-invalidate](cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104). + This exists as a separate script to support CloudFront invalidation when using the upload-to-s3 script. + +Potential augur curate scripts + +- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records +- [transform-authors](transform-authors) - Abbreviates full author lists to '<first author> et al.'
+- [transform-field-names](transform-field-names) - Rename fields of NDJSON records +- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"<country_value>[:<region>][, <locality>]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) diff --git a/ingest/vendored/cloudfront-invalidate b/ingest/vendored/cloudfront-invalidate new file mode 100755 index 00000000..dec48529 --- /dev/null +++ b/ingest/vendored/cloudfront-invalidate @@ -0,0 +1,42 @@ +#!/bin/bash +# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 +set -euo pipefail + +main() { + local domain="$1" + shift + local paths=("$@") + local distribution invalidation + + echo "-> Finding CloudFront distribution" + distribution=$( + aws cloudfront list-distributions \ + --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ + --output text + ) + + if [[ -z $distribution || $distribution == None ]]; then + exec >&2 + echo "Unable to find CloudFront distribution id for $domain" + echo + echo "Are your AWS CLI credentials for the right account?" + exit 1 + fi + + echo "-> Creating CloudFront invalidation for distribution $distribution" + invalidation=$( + aws cloudfront create-invalidation \ + --distribution-id "$distribution" \ + --paths "${paths[@]}" \ + --query Invalidation.Id \ + --output text + ) + + echo "-> Waiting for CloudFront invalidation $invalidation to complete" + echo " Ctrl-C to stop waiting." + aws cloudfront wait invalidation-completed \ + --distribution-id "$distribution" \ + --id "$invalidation" +} + +main "$@" diff --git a/ingest/vendored/merge-user-metadata b/ingest/vendored/merge-user-metadata new file mode 100755 index 00000000..341c2dfa --- /dev/null +++ b/ingest/vendored/merge-user-metadata @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Merges user curated annotations with the NDJSON records from stdin, with the user +curations overwriting the existing fields. The modified records are output +to stdout. This does not do any additional transformations on top of the user +curations. +""" +import argparse +import csv +import json +from collections import defaultdict +from sys import exit, stdin, stderr, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--annotations", metavar="TSV", required=True, + help="Manually curated annotations TSV file. " + + "The TSV should not have a header and should have exactly three columns: " + + "id to match existing metadata, field name, and field value. " + + "If there are multiple annotations for the same id and field, then the last value is used. " + + "Lines starting with '#' are treated as comments. 
" + + "Any '#' after the field value are treated as comments.") + parser.add_argument("--id-field", default="accession", + help="The ID field in the metadata to use to merge with the annotations.") + + args = parser.parse_args() + + annotations = defaultdict(dict) + with open(args.annotations, 'r') as annotations_fh: + csv_reader = csv.reader(annotations_fh, delimiter='\t') + for row in csv_reader: + if not row or row[0].lstrip()[0] == '#': + continue + elif len(row) != 3: + print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) + continue + id, field, value = row + annotations[id][field] = value.partition('#')[0].rstrip() + + for record in stdin: + record = json.loads(record) + + record_id = record.get(args.id_field) + if record_id is None: + print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) + exit(1) + + record.update(annotations.get(record_id, {})) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/notify-on-job-fail b/ingest/vendored/notify-on-job-fail new file mode 100755 index 00000000..02cb6bad --- /dev/null +++ b/ingest/vendored/notify-on-job-fail @@ -0,0 +1,23 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" + +echo "Notifying Slack about failed ${job_name} job." +message="❌ ${job_name} job has FAILED 😞 " + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " +elif [[ -n "${GITHUB_RUN_ID}" ]]; then + message+="See GitHub Action for error details. " +fi + +"$bin"/notify-slack "$message" diff --git a/ingest/vendored/notify-on-job-start b/ingest/vendored/notify-on-job-start new file mode 100755 index 00000000..3e44bb09 --- /dev/null +++ b/ingest/vendored/notify-on-job-start @@ -0,0 +1,27 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +: "${AWS_BATCH_JOB_ID:=}" +: "${GITHUB_RUN_ID:=}" + +bin="$(dirname "$0")" +job_name="${1:?A job name is required as the first argument}" +github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" +build_dir="${3:-ingest}" + +echo "Notifying Slack about started ${job_name} job." +message="${job_name} job has started." + +if [[ -n "${GITHUB_RUN_ID}" ]]; then + message+=" The job was submitted by GitHub Action ." +fi + +if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then + message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." 
+ message+=" Follow along in your local clone of ${github_repo} with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ${build_dir}"'```' +fi + +"$bin"/notify-slack "$message" diff --git a/ingest/vendored/notify-slack b/ingest/vendored/notify-slack new file mode 100755 index 00000000..db98bfb8 --- /dev/null +++ b/ingest/vendored/notify-slack @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +upload=0 +output=/dev/null +thread_ts="" +broadcast=0 +args=() + +for arg; do + case "$arg" in + --upload) + upload=1;; + --output=*) + output="${arg#*=}";; + --thread-ts=*) + thread_ts="${arg#*=}";; + --broadcast) + broadcast=1;; + *) + args+=("$arg");; + esac +done + +set -- "${args[@]}" + +text="${1:?Some message text is required.}" + +if [[ "$upload" == 1 ]]; then + echo "Uploading data to Slack with the message: $text" + curl https://slack.com/api/files.upload \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channels="$SLACK_CHANNELS" \ + --form-string title="$text" \ + --form-string filename="$text" \ + --form-string thread_ts="$thread_ts" \ + --form file=@/dev/stdin \ + --form filetype=text \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +else + echo "Posting Slack message: $text" + curl https://slack.com/api/chat.postMessage \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channel="$SLACK_CHANNELS" \ + --form-string text="$text" \ + --form-string thread_ts="$thread_ts" \ + --form-string reply_broadcast="$broadcast" \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" +fi diff --git a/ingest/vendored/s3-object-exists b/ingest/vendored/s3-object-exists new file mode 100755 index 00000000..faac4219 --- /dev/null +++ b/ingest/vendored/s3-object-exists @@ -0,0 +1,8 @@ +#!/bin/bash +set -euo pipefail + +url="${1#s3://}" +bucket="${url%%/*}" +key="${url#*/}" + +aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/ingest/vendored/sha256sum b/ingest/vendored/sha256sum new file mode 100755 index 00000000..32d7ef82 --- /dev/null +++ b/ingest/vendored/sha256sum @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +Portable sha256sum utility. +""" +from hashlib import sha256 +from sys import stdin + +chunk_size = 5 * 1024**2 # 5 MiB + +h = sha256() + +for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): + h.update(chunk) + +print(h.hexdigest()) diff --git a/ingest/vendored/transform-authors b/ingest/vendored/transform-authors new file mode 100755 index 00000000..0bade20e --- /dev/null +++ b/ingest/vendored/transform-authors @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +Abbreviates a full list of authors to be ' et al.' of the NDJSON +record from stdin and outputs modified records to stdout. + +Note: This is a "best effort" approach and can potentially mangle the author name. 
+""" +import argparse +import json +import re +from sys import stderr, stdin, stdout + + +def parse_authors(record: dict, authors_field: str, default_value: str, + index: int, abbr_authors_field: str = None) -> dict: + # Strip and normalize whitespace + new_authors = re.sub(r'\s+', ' ', record[authors_field]) + + if new_authors == "": + new_authors = default_value + else: + # Split authors list on comma/semicolon + # OR "and"/"&" with at least one space before and after + new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] + + # if it does not already end with " et al.", add it + if not new_authors.strip('. ').endswith(" et al"): + new_authors += ' et al' + + if abbr_authors_field: + if record.get(abbr_authors_field): + print( + f"WARNING: the {abbr_authors_field!r} field already exists", + f"in record {index} and will be overwritten!", + file=stderr + ) + + record[abbr_authors_field] = new_authors + else: + record[authors_field] = new_authors + + return record + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--authors-field", default="authors", + help="The field containing list of authors.") + parser.add_argument("--default-value", default="?", + help="Default value to use if authors list is empty.") + parser.add_argument("--abbr-authors-field", + help="The field for the generated abbreviated authors. " + + "If not provided, the original authors field will be modified.") + + args = parser.parse_args() + + for index, record in enumerate(stdin): + record = json.loads(record) + + parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/transform-field-names b/ingest/vendored/transform-field-names new file mode 100755 index 00000000..fde223fc --- /dev/null +++ b/ingest/vendored/transform-field-names @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Renames fields of the NDJSON record from stdin and outputs modified records +to stdout. +""" +import argparse +import json +from sys import stderr, stdin, stdout + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--field-map", nargs="+", + help="Fields names in the NDJSON record mapped to new field names, " + + "formatted as '{old_field_name}={new_field_name}'. " + + "If the old field does not exist in record, the new field will be added with an empty string value." + + "If the new field already exists in record, then the renaming of the old field will be skipped.") + parser.add_argument("--force", action="store_true", + help="Force renaming of old field even if the new field already exists. 
" + + "Please keep in mind this will overwrite the value of the new field.") + + args = parser.parse_args() + + field_map = {} + for field in args.field_map: + old_name, new_name = field.split('=') + field_map[old_name] = new_name + + for record in stdin: + record = json.loads(record) + + for old_field, new_field in field_map.items(): + + if record.get(new_field) and not args.force: + print( + f"WARNING: skipping rename of {old_field} because record", + f"already has a field named {new_field}.", + file=stderr + ) + continue + + record[new_field] = record.pop(old_field, '') + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/transform-genbank-location b/ingest/vendored/transform-genbank-location new file mode 100755 index 00000000..70ba56fb --- /dev/null +++ b/ingest/vendored/transform-genbank-location @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate +fields: 'country', 'division', and 'location'. Checks that a record is from +GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". + +Outputs the modified record to stdout. +""" +import json +from sys import stdin, stdout + + +def parse_location(record: dict) -> dict: + # Expected pattern for the location field is "[:][, ]" + # See GenBank docs for their "country" field: + # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ + geographic_data = record['location'].split(':') + + country = geographic_data[0] + division = '' + location = '' + + if len(geographic_data) == 2: + division , _ , location = geographic_data[1].partition(',') + + record['country'] = country.strip() + record['division'] = division.strip() + record['location'] = location.strip() + + return record + + +if __name__ == '__main__': + + for record in stdin: + record = json.loads(record) + + database = record.get('database', '') + if database in {'GenBank', 'RefSeq'}: + parse_location(record) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() diff --git a/ingest/vendored/trigger b/ingest/vendored/trigger new file mode 100755 index 00000000..d40553b6 --- /dev/null +++ b/ingest/vendored/trigger @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +: "${PAT_GITHUB_DISPATCH:=}" + +repo="${1:?A repository name is required as the first argument.}" +event_type="${2:?An event type is required as the second argument.}" +shift 2 + +if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then + cat >&2 <<. +You must specify options to curl for your GitHub credentials. For example, you +can specify your GitHub username, and will be prompted for your password: + + $0 $repo $event_type --user + +Be sure to enter a personal access token¹ as your password since GitHub has +discontinued password authentication to the API starting on November 13, 2020². + +You can also store your credentials or a personal access token in a netrc +file³: + + machine api.github.com + login + password + +and then tell curl to use it: + + $0 $repo $event_type --netrc + +which will then not require you to type your password every time. + +¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line +² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password +³ https://ec.haxx.se/usingcurl/usingcurl-netrc +. 
+ exit 1 +fi + +auth=':' +if [[ -n $PAT_GITHUB_DISPATCH ]]; then + auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" +fi + +if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \ + -H 'Accept: application/vnd.github.v3+json' \ + -H 'Content-Type: application/json' \ + -H "$auth" \ + -d '{"event_type":"'"$event_type"'"}' \ + "$@" +then + echo "Successfully triggered $event_type" +else + echo "Request failed" >&2 + exit 1 +fi From 9104b11b020fbb5d6a4bdf7174548117e2f7eebb Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 13:39:39 -0700 Subject: [PATCH 2/5] Describe subrepo setup The previous commit was created by the following command: git subrepo clone https://github.com/nextstrain/ingest ingest/vendored Add a section in the README on how to use this directory in the future. --- ingest/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ingest/README.md b/ingest/README.md index a8911985..92bed530 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -85,3 +85,16 @@ These are optional environment variables used in our automated pipeline for prov GenBank sequences and metadata are fetched via NCBI Virus. The exact URL used to fetch data is constructed in `bin/genbank-url`. + +## `ingest/vendored` + +This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies of ingest scripts in `ingest/vendored`, from [nextstrain/ingest](https://github.com/nextstrain/ingest). To pull new changes from the central ingest repository, first install `git subrepo`, then run: + +```sh +git subrepo pull ingest/vendored +``` + +Changes should not be pushed using `git subrepo push`. + +1. For pathogen-specific changes, make them in this repository via a pull request. +2. For pathogen-agnostic changes, make them on [nextstrain/ingest](https://github.com/nextstrain/ingest) via pull request there, then use `git subrepo pull` to add those changes to this repository. From 29ddb20b49293239ec28bffb6a91cff7d06b6fbf Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 13:42:48 -0700 Subject: [PATCH 3/5] Use centralized scripts that are functionally identical Remove the copies in this repo and update references. 
--- .github/workflows/rebuild-all.yaml | 2 +- ingest/bin/cloudfront-invalidate | 42 ------------ ingest/bin/download-from-s3 | 4 +- ingest/bin/merge-user-metadata | 55 ---------------- ingest/bin/notify-on-diff | 3 +- ingest/bin/notify-on-record-change | 3 +- ingest/bin/s3-object-exists | 9 --- ingest/bin/sha256sum | 16 ----- ingest/bin/transform-authors | 66 ------------------- ingest/bin/transform-field-names | 48 -------------- ingest/bin/transform-genbank-location | 43 ------------ ingest/bin/trigger | 56 ---------------- ingest/bin/trigger-on-new-data | 3 +- ingest/bin/upload-to-s3 | 5 +- ingest/workflow/snakemake_rules/transform.smk | 8 +-- 15 files changed, 16 insertions(+), 347 deletions(-) delete mode 100755 ingest/bin/cloudfront-invalidate delete mode 100755 ingest/bin/merge-user-metadata delete mode 100755 ingest/bin/s3-object-exists delete mode 100755 ingest/bin/sha256sum delete mode 100755 ingest/bin/transform-authors delete mode 100755 ingest/bin/transform-field-names delete mode 100755 ingest/bin/transform-genbank-location delete mode 100755 ingest/bin/trigger diff --git a/.github/workflows/rebuild-all.yaml b/.github/workflows/rebuild-all.yaml index cf7984b8..b687e153 100644 --- a/.github/workflows/rebuild-all.yaml +++ b/.github/workflows/rebuild-all.yaml @@ -9,6 +9,6 @@ jobs: steps: - uses: actions/checkout@v3 - name: Repository Dispatch - run: ./ingest/bin/trigger monkeypox rebuild + run: ./ingest/vendored/trigger monkeypox rebuild env: PAT_GITHUB_DISPATCH: ${{ secrets.GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH }} diff --git a/ingest/bin/cloudfront-invalidate b/ingest/bin/cloudfront-invalidate deleted file mode 100755 index dec48529..00000000 --- a/ingest/bin/cloudfront-invalidate +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 -set -euo pipefail - -main() { - local domain="$1" - shift - local paths=("$@") - local distribution invalidation - - echo "-> Finding CloudFront distribution" - distribution=$( - aws cloudfront list-distributions \ - --query "DistributionList.Items[?contains(Aliases.Items, \`$domain\`)] | [0].Id" \ - --output text - ) - - if [[ -z $distribution || $distribution == None ]]; then - exec >&2 - echo "Unable to find CloudFront distribution id for $domain" - echo - echo "Are your AWS CLI credentials for the right account?" - exit 1 - fi - - echo "-> Creating CloudFront invalidation for distribution $distribution" - invalidation=$( - aws cloudfront create-invalidation \ - --distribution-id "$distribution" \ - --paths "${paths[@]}" \ - --query Invalidation.Id \ - --output text - ) - - echo "-> Waiting for CloudFront invalidation $invalidation to complete" - echo " Ctrl-C to stop waiting." 
- aws cloudfront wait invalidation-completed \ - --distribution-id "$distribution" \ - --id "$invalidation" -} - -main "$@" diff --git a/ingest/bin/download-from-s3 b/ingest/bin/download-from-s3 index 762fe581..99424c77 100755 --- a/ingest/bin/download-from-s3 +++ b/ingest/bin/download-from-s3 @@ -2,7 +2,7 @@ # Originally copied from nextstrain/ncov-ingest repo set -euo pipefail -bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored main() { local src="${1:?A source s3:// URL is required as the first argument.}" @@ -13,7 +13,7 @@ main() { local key="${s3path#*/}" local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - dst_hash="$("$bin/sha256sum" < "$dst" || true)" + dst_hash="$("$vendored/sha256sum" < "$dst" || true)" src_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" echo "[ INFO] Downloading $src → $dst" diff --git a/ingest/bin/merge-user-metadata b/ingest/bin/merge-user-metadata deleted file mode 100755 index 341c2dfa..00000000 --- a/ingest/bin/merge-user-metadata +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -""" -Merges user curated annotations with the NDJSON records from stdin, with the user -curations overwriting the existing fields. The modified records are output -to stdout. This does not do any additional transformations on top of the user -curations. -""" -import argparse -import csv -import json -from collections import defaultdict -from sys import exit, stdin, stderr, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--annotations", metavar="TSV", required=True, - help="Manually curated annotations TSV file. " + - "The TSV should not have a header and should have exactly three columns: " + - "id to match existing metadata, field name, and field value. " + - "If there are multiple annotations for the same id and field, then the last value is used. " + - "Lines starting with '#' are treated as comments. 
" + - "Any '#' after the field value are treated as comments.") - parser.add_argument("--id-field", default="accession", - help="The ID field in the metadata to use to merge with the annotations.") - - args = parser.parse_args() - - annotations = defaultdict(dict) - with open(args.annotations, 'r') as annotations_fh: - csv_reader = csv.reader(annotations_fh, delimiter='\t') - for row in csv_reader: - if not row or row[0].lstrip()[0] == '#': - continue - elif len(row) != 3: - print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) - continue - id, field, value = row - annotations[id][field] = value.partition('#')[0].rstrip() - - for record in stdin: - record = json.loads(record) - - record_id = record.get(args.id_field) - if record_id is None: - print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) - exit(1) - - record.update(annotations.get(record_id, {})) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/notify-on-diff b/ingest/bin/notify-on-diff index c304d6b5..32464801 100755 --- a/ingest/bin/notify-on-diff +++ b/ingest/bin/notify-on-diff @@ -6,6 +6,7 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source file is required as the first argument.}" dst="${2:?A destination s3:// URL is required as the second argument.}" @@ -16,7 +17,7 @@ diff="$(mktemp -t diff-XXXXXX)" trap "rm -f '$dst_local' '$diff'" EXIT # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 "$bin"/download-from-s3 "$dst" "$dst_local" diff --git a/ingest/bin/notify-on-record-change b/ingest/bin/notify-on-record-change index 595835b5..6487fbcb 100755 --- a/ingest/bin/notify-on-record-change +++ b/ingest/bin/notify-on-record-change @@ -6,13 +6,14 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored src="${1:?A source ndjson file is required as the first argument.}" dst="${2:?A destination ndjson s3:// URL is required as the second argument.}" source_name=${3:?A record source name is required as the third argument.} # if the file is not already present, just exit -"$bin"/s3-object-exists "$dst" || exit 0 +"$vendored"/s3-object-exists "$dst" || exit 0 s3path="${dst#s3://}" bucket="${s3path%%/*}" diff --git a/ingest/bin/s3-object-exists b/ingest/bin/s3-object-exists deleted file mode 100755 index d586d0b8..00000000 --- a/ingest/bin/s3-object-exists +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest -set -euo pipefail - -url="${1#s3://}" -bucket="${url%%/*}" -key="${url#*/}" - -aws s3api head-object --bucket "$bucket" --key "$key" &>/dev/null diff --git a/ingest/bin/sha256sum b/ingest/bin/sha256sum deleted file mode 100755 index aa05af00..00000000 --- a/ingest/bin/sha256sum +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 -# Originally copied from nextstrain/ncov-ingest repo -""" -Portable sha256sum utility. 
-""" -from hashlib import sha256 -from sys import stdin - -chunk_size = 5 * 1024**2 # 5 MiB - -h = sha256() - -for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): - h.update(chunk) - -print(h.hexdigest()) diff --git a/ingest/bin/transform-authors b/ingest/bin/transform-authors deleted file mode 100755 index 0bade20e..00000000 --- a/ingest/bin/transform-authors +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -""" -Abbreviates a full list of authors to be ' et al.' of the NDJSON -record from stdin and outputs modified records to stdout. - -Note: This is a "best effort" approach and can potentially mangle the author name. -""" -import argparse -import json -import re -from sys import stderr, stdin, stdout - - -def parse_authors(record: dict, authors_field: str, default_value: str, - index: int, abbr_authors_field: str = None) -> dict: - # Strip and normalize whitespace - new_authors = re.sub(r'\s+', ' ', record[authors_field]) - - if new_authors == "": - new_authors = default_value - else: - # Split authors list on comma/semicolon - # OR "and"/"&" with at least one space before and after - new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] - - # if it does not already end with " et al.", add it - if not new_authors.strip('. ').endswith(" et al"): - new_authors += ' et al' - - if abbr_authors_field: - if record.get(abbr_authors_field): - print( - f"WARNING: the {abbr_authors_field!r} field already exists", - f"in record {index} and will be overwritten!", - file=stderr - ) - - record[abbr_authors_field] = new_authors - else: - record[authors_field] = new_authors - - return record - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--authors-field", default="authors", - help="The field containing list of authors.") - parser.add_argument("--default-value", default="?", - help="Default value to use if authors list is empty.") - parser.add_argument("--abbr-authors-field", - help="The field for the generated abbreviated authors. " + - "If not provided, the original authors field will be modified.") - - args = parser.parse_args() - - for index, record in enumerate(stdin): - record = json.loads(record) - - parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/transform-field-names b/ingest/bin/transform-field-names deleted file mode 100755 index fde223fc..00000000 --- a/ingest/bin/transform-field-names +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Renames fields of the NDJSON record from stdin and outputs modified records -to stdout. -""" -import argparse -import json -from sys import stderr, stdin, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--field-map", nargs="+", - help="Fields names in the NDJSON record mapped to new field names, " + - "formatted as '{old_field_name}={new_field_name}'. " + - "If the old field does not exist in record, the new field will be added with an empty string value." + - "If the new field already exists in record, then the renaming of the old field will be skipped.") - parser.add_argument("--force", action="store_true", - help="Force renaming of old field even if the new field already exists. 
" + - "Please keep in mind this will overwrite the value of the new field.") - - args = parser.parse_args() - - field_map = {} - for field in args.field_map: - old_name, new_name = field.split('=') - field_map[old_name] = new_name - - for record in stdin: - record = json.loads(record) - - for old_field, new_field in field_map.items(): - - if record.get(new_field) and not args.force: - print( - f"WARNING: skipping rename of {old_field} because record", - f"already has a field named {new_field}.", - file=stderr - ) - continue - - record[new_field] = record.pop(old_field, '') - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/transform-genbank-location b/ingest/bin/transform-genbank-location deleted file mode 100755 index 70ba56fb..00000000 --- a/ingest/bin/transform-genbank-location +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -""" -Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate -fields: 'country', 'division', and 'location'. Checks that a record is from -GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". - -Outputs the modified record to stdout. -""" -import json -from sys import stdin, stdout - - -def parse_location(record: dict) -> dict: - # Expected pattern for the location field is "[:][, ]" - # See GenBank docs for their "country" field: - # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ - geographic_data = record['location'].split(':') - - country = geographic_data[0] - division = '' - location = '' - - if len(geographic_data) == 2: - division , _ , location = geographic_data[1].partition(',') - - record['country'] = country.strip() - record['division'] = division.strip() - record['location'] = location.strip() - - return record - - -if __name__ == '__main__': - - for record in stdin: - record = json.loads(record) - - database = record.get('database', '') - if database in {'GenBank', 'RefSeq'}: - parse_location(record) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/bin/trigger b/ingest/bin/trigger deleted file mode 100755 index d40553b6..00000000 --- a/ingest/bin/trigger +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${PAT_GITHUB_DISPATCH:=}" - -repo="${1:?A repository name is required as the first argument.}" -event_type="${2:?An event type is required as the second argument.}" -shift 2 - -if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then - cat >&2 <<. -You must specify options to curl for your GitHub credentials. For example, you -can specify your GitHub username, and will be prompted for your password: - - $0 $repo $event_type --user - -Be sure to enter a personal access token¹ as your password since GitHub has -discontinued password authentication to the API starting on November 13, 2020². - -You can also store your credentials or a personal access token in a netrc -file³: - - machine api.github.com - login - password - -and then tell curl to use it: - - $0 $repo $event_type --netrc - -which will then not require you to type your password every time. - -¹ https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line -² https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password -³ https://ec.haxx.se/usingcurl/usingcurl-netrc -. 
- exit 1 -fi - -auth=':' -if [[ -n $PAT_GITHUB_DISPATCH ]]; then - auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}" -fi - -if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \ - -H 'Accept: application/vnd.github.v3+json' \ - -H 'Content-Type: application/json' \ - -H "$auth" \ - -d '{"event_type":"'"$event_type"'"}' \ - "$@" -then - echo "Successfully triggered $event_type" -else - echo "Request failed" >&2 - exit 1 -fi diff --git a/ingest/bin/trigger-on-new-data b/ingest/bin/trigger-on-new-data index 760a0187..86f40e2f 100755 --- a/ingest/bin/trigger-on-new-data +++ b/ingest/bin/trigger-on-new-data @@ -4,6 +4,7 @@ set -euo pipefail : "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored metadata="${1:?A metadata upload output file is required as the first argument.}" sequences="${2:?An sequence FASTA upload output file is required as the second argument.}" @@ -17,7 +18,7 @@ slack_message="" # grep exit status 0 for found match, 1 for no match, 2 if an error occurred if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then slack_message="Triggering new builds due to updated metadata and/or sequences" - "$bin"/trigger "monkeypox" "rebuild" + "$vendored"/trigger "monkeypox" "rebuild" elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files." else diff --git a/ingest/bin/upload-to-s3 b/ingest/bin/upload-to-s3 index 78b35edd..2c0394f8 100755 --- a/ingest/bin/upload-to-s3 +++ b/ingest/bin/upload-to-s3 @@ -3,6 +3,7 @@ set -euo pipefail bin="$(dirname "$0")" +vendored="$(dirname "$0")"/../vendored main() { local quiet=0 @@ -26,7 +27,7 @@ main() { local key="${s3path#*/}" local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 - src_hash="$("$bin/sha256sum" < "$src")" + src_hash="$("$vendored/sha256sum" < "$src")" dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" if [[ $src_hash != "$dst_hash" ]]; then @@ -46,7 +47,7 @@ main() { if [[ -n $cloudfront_domain ]]; then echo "Creating CloudFront invalidation for $cloudfront_domain/$key" - if ! "$bin"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then + if ! "$vendored"/cloudfront-invalidate "$cloudfront_domain" "/$key"; then echo "CloudFront invalidation failed, but exiting with success anyway." 
fi fi diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk index b08fc816..1a1726e2 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/transform.smk @@ -65,7 +65,7 @@ rule transform: shell: """ (cat {input.sequences_ndjson} \ - | ./bin/transform-field-names \ + | ./vendored/transform-field-names \ --field-map {params.field_map} \ | augur curate normalize-strings \ | ./bin/transform-strain-names \ @@ -74,18 +74,18 @@ rule transform: | ./bin/transform-date-fields \ --date-fields {params.date_fields} \ --expected-date-formats {params.expected_date_formats} \ - | ./bin/transform-genbank-location \ + | ./vendored/transform-genbank-location \ | ./bin/transform-string-fields \ --titlecase-fields {params.titlecase_fields} \ --articles {params.articles} \ --abbreviations {params.abbreviations} \ - | ./bin/transform-authors \ + | ./vendored/transform-authors \ --authors-field {params.authors_field} \ --default-value {params.authors_default_value} \ --abbr-authors-field {params.abbr_authors_field} \ | ./bin/apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ - | ./bin/merge-user-metadata \ + | ./vendored/merge-user-metadata \ --annotations {input.annotations} \ --id-field {params.annotations_id} \ | ./bin/ndjson-to-tsv-and-fasta \ From de3f6e939867618bc0e999ebca2587eaaac21582 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 13:50:15 -0700 Subject: [PATCH 4/5] Use centralized Slack notification scripts Remove the copies in this repo and update references. Add new positional arguments required by the centralized scripts. --- .github/workflows/fetch-and-ingest.yaml | 2 +- bin/notify-on-deploy | 4 +- bin/notify-on-error | 4 +- bin/notify-on-start | 4 +- bin/notify-on-success | 4 +- ingest/bin/notify-on-diff | 4 +- ingest/bin/notify-on-job-fail | 21 ------- ingest/bin/notify-on-job-start | 24 -------- ingest/bin/notify-on-record-change | 2 +- ingest/bin/notify-slack | 58 ------------------- ingest/bin/trigger-on-new-data | 2 +- ingest/bin/upload-to-s3 | 2 +- .../snakemake_rules/slack_notifications.smk | 4 +- 13 files changed, 16 insertions(+), 119 deletions(-) delete mode 100755 ingest/bin/notify-on-job-fail delete mode 100755 ingest/bin/notify-on-job-start delete mode 100755 ingest/bin/notify-slack diff --git a/.github/workflows/fetch-and-ingest.yaml b/.github/workflows/fetch-and-ingest.yaml index 22cbf667..5c789677 100644 --- a/.github/workflows/fetch-and-ingest.yaml +++ b/.github/workflows/fetch-and-ingest.yaml @@ -73,4 +73,4 @@ jobs: - name: notify_pipeline_failed if: ${{ failure() }} - run: ./ingest/bin/notify-on-job-fail + run: ./ingest/vendored/notify-on-job-fail Ingest nextstrain/monkeypox diff --git a/bin/notify-on-deploy b/bin/notify-on-deploy index 77763e74..e279a340 100755 --- a/bin/notify-on-deploy +++ b/bin/notify-on-deploy @@ -5,11 +5,11 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" deployment_url="${1:?A deployment url is required as the first argument.}" slack_ts_file="${2:?A Slack thread timestamp file is required as the second argument.}" echo "Notifying Slack about deployed builds." 
-"$ingest_bin"/notify-slack "Deployed this build to $deployment_url" \ +"$ingest_vendored"/notify-slack "Deployed this build to $deployment_url" \ --thread-ts="$(cat "$slack_ts_file")" diff --git a/bin/notify-on-error b/bin/notify-on-error index bbf82a91..810ce8b1 100755 --- a/bin/notify-on-error +++ b/bin/notify-on-error @@ -8,7 +8,7 @@ set -euo pipefail : "${GITHUB_RUN_ID:=}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" slack_ts_file="${1:-}" @@ -26,6 +26,6 @@ elif [[ -n "${GITHUB_RUN_ID}" ]]; then message+="See GitHub Action for error details." fi -"$ingest_bin"/notify-slack "$message" \ +"$ingest_vendored"/notify-slack "$message" \ --thread-ts="$thread_ts" \ --broadcast diff --git a/bin/notify-on-start b/bin/notify-on-start index 6042e060..c0695ea4 100755 --- a/bin/notify-on-start +++ b/bin/notify-on-start @@ -8,7 +8,7 @@ set -euo pipefail : "${GITHUB_RUN_ID:=}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" build_name="${1:?A build name is required as the first argument.}" slack_ts_output="${2:?A Slack thread timestamp file is required as the second argument}" @@ -29,7 +29,7 @@ if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then message+=" Follow along in your local \`monkeypox\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} . "'```' fi -"$ingest_bin"/notify-slack "$message" --output="$slack_response" +"$ingest_vendored"/notify-slack "$message" --output="$slack_response" echo "Saving Slack thread timestamp to '$slack_ts_output'." diff --git a/bin/notify-on-success b/bin/notify-on-success index f4b21c37..69b85c8f 100755 --- a/bin/notify-on-success +++ b/bin/notify-on-success @@ -5,10 +5,10 @@ set -euo pipefail : "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" base="$(realpath "$(dirname "$0")/..")" -ingest_bin="$base/ingest/bin" +ingest_vendored="$base/ingest/vendored" slack_ts_file="${1:?A Slack thread timestamp file is required as the first argument.}" echo "Notifying Slack about successful build." -"$ingest_bin"/notify-slack "✅ This pipeline has successfully finished 🎉" \ +"$ingest_vendored"/notify-slack "✅ This pipeline has successfully finished 🎉" \ --thread-ts="$(cat "$slack_ts_file")" diff --git a/ingest/bin/notify-on-diff b/ingest/bin/notify-on-diff index 32464801..e401f124 100755 --- a/ingest/bin/notify-on-diff +++ b/ingest/bin/notify-on-diff @@ -27,10 +27,10 @@ diff "$dst_local" "$src" > "$diff" || diff_exit_code=$? if [[ "$diff_exit_code" -eq 1 ]]; then echo "Notifying Slack about diff." - "$bin"/notify-slack --upload "$src.diff" < "$diff" + "$vendored"/notify-slack --upload "$src.diff" < "$diff" elif [[ "$diff_exit_code" -gt 1 ]]; then echo "Notifying Slack about diff failure" - "$bin"/notify-slack "Diff failed for $src" + "$vendored"/notify-slack "Diff failed for $src" else echo "No change in $src." fi diff --git a/ingest/bin/notify-on-job-fail b/ingest/bin/notify-on-job-fail deleted file mode 100755 index 23d3a926..00000000 --- a/ingest/bin/notify-on-job-fail +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about failed ingest job." 
-message="❌ Ingest job has FAILED 😞 " - -if [ -n "${AWS_BATCH_JOB_ID}" ]; then - message+="See AWS Batch job \`${AWS_BATCH_JOB_ID}\` () for error details. " -elif [ -n "${GITHUB_RUN_ID}" ]; then - message+="See GitHub Action for error details. " -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/bin/notify-on-job-start b/ingest/bin/notify-on-job-start deleted file mode 100755 index 9410fa38..00000000 --- a/ingest/bin/notify-on-job-start +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -: "${AWS_BATCH_JOB_ID:=}" -: "${GITHUB_RUN_ID:=}" - -bin="$(dirname "$0")" - -echo "Notifying Slack about started ingest job." -message="🐵 Monkeypox ingest job has started." - -if [[ -n "${GITHUB_RUN_ID}" ]]; then - message+=" The job was submitted by GitHub Action ." -fi - -if [[ -n "${AWS_BATCH_JOB_ID}" ]]; then - message+=" The job was launched as AWS Batch job \`${AWS_BATCH_JOB_ID}\` ()." - message+=" Follow along in your local \`monkeypox\` repo with: "'```'"nextstrain build --aws-batch --no-download --attach ${AWS_BATCH_JOB_ID} ingest/"'```' -fi - -"$bin"/notify-slack "$message" diff --git a/ingest/bin/notify-on-record-change b/ingest/bin/notify-on-record-change index 6487fbcb..3987a192 100755 --- a/ingest/bin/notify-on-record-change +++ b/ingest/bin/notify-on-record-change @@ -52,4 +52,4 @@ fi slack_message+=" (Total record count: $src_record_count)" -"$bin"/notify-slack "$slack_message" +"$vendored"/notify-slack "$slack_message" diff --git a/ingest/bin/notify-slack b/ingest/bin/notify-slack deleted file mode 100755 index 6ca20dec..00000000 --- a/ingest/bin/notify-slack +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# Originally copied from nextstrain/ncov-ingest repo -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -upload=0 -output=/dev/null -thread_ts="" -broadcast=0 -args=() - -for arg; do - case "$arg" in - --upload) - upload=1;; - --output=*) - output="${arg#*=}";; - --thread-ts=*) - thread_ts="${arg#*=}";; - --broadcast) - broadcast=1;; - *) - args+=("$arg");; - esac -done - -set -- "${args[@]}" - -text="${1:?Some message text is required.}" - -if [[ "$upload" == 1 ]]; then - echo "Uploading data to Slack with the message: $text" - curl https://slack.com/api/files.upload \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channels="$SLACK_CHANNELS" \ - --form-string title="$text" \ - --form-string filename="$text" \ - --form-string thread_ts="$thread_ts" \ - --form-string reply_broadcast="$broadcast" \ - --form file=@/dev/stdin \ - --form filetype=text \ - --fail --silent --show-error \ - --http1.1 \ - --output "$output" -else - echo "Posting Slack message: $text" - curl https://slack.com/api/chat.postMessage \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channel="$SLACK_CHANNELS" \ - --form-string text="$text" \ - --form-string thread_ts="$thread_ts" \ - --form-string reply_broadcast="$broadcast" \ - --fail --silent --show-error \ - --http1.1 \ - --output "$output" -fi diff --git a/ingest/bin/trigger-on-new-data b/ingest/bin/trigger-on-new-data index 86f40e2f..97c85ff7 100755 --- a/ingest/bin/trigger-on-new-data +++ b/ingest/bin/trigger-on-new-data @@ -26,6 +26,6 @@ else fi -if ! "$bin"/notify-slack "$slack_message"; then +if ! 
"$vendored"/notify-slack "$slack_message"; then echo "Notifying Slack failed, but exiting with success anyway." fi diff --git a/ingest/bin/upload-to-s3 b/ingest/bin/upload-to-s3 index 2c0394f8..cc867755 100755 --- a/ingest/bin/upload-to-s3 +++ b/ingest/bin/upload-to-s3 @@ -57,7 +57,7 @@ main() { exit 0 fi - if ! "$bin"/notify-slack "Updated $dst available."; then + if ! "$vendored"/notify-slack "Updated $dst available."; then echo "Notifying Slack failed, but exiting with success anyway." fi else diff --git a/ingest/workflow/snakemake_rules/slack_notifications.smk b/ingest/workflow/snakemake_rules/slack_notifications.smk index b4a54753..d19b848b 100644 --- a/ingest/workflow/snakemake_rules/slack_notifications.smk +++ b/ingest/workflow/snakemake_rules/slack_notifications.smk @@ -48,8 +48,8 @@ rule notify_on_metadata_diff: onstart: - shell("./bin/notify-on-job-start") + shell("./vendored/notify-on-job-start Ingest nextstrain/monkeypox") onerror: - shell("./bin/notify-on-job-fail") + shell("./vendored/notify-on-job-fail Ingest nextstrain/monkeypox") From 61a69be34b14ee9b246cea5dd53442378f640dd3 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 28 Jul 2023 14:37:56 -0700 Subject: [PATCH 5/5] Remove unused bin variable --- ingest/bin/trigger-on-new-data | 1 - ingest/bin/upload-to-s3 | 1 - 2 files changed, 2 deletions(-) diff --git a/ingest/bin/trigger-on-new-data b/ingest/bin/trigger-on-new-data index 97c85ff7..1f3e0bac 100755 --- a/ingest/bin/trigger-on-new-data +++ b/ingest/bin/trigger-on-new-data @@ -3,7 +3,6 @@ set -euo pipefail : "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored metadata="${1:?A metadata upload output file is required as the first argument.}" diff --git a/ingest/bin/upload-to-s3 b/ingest/bin/upload-to-s3 index cc867755..d913182f 100755 --- a/ingest/bin/upload-to-s3 +++ b/ingest/bin/upload-to-s3 @@ -2,7 +2,6 @@ # Originally copied from nextstrain/ncov-ingest repo set -euo pipefail -bin="$(dirname "$0")" vendored="$(dirname "$0")"/../vendored main() {