diff --git a/.env b/.env
index 443c5e1..6dc3d11 100644
--- a/.env
+++ b/.env
@@ -3,3 +3,5 @@ IPTS_HOSTNAME=
 IPTS_PORT=
 IPTS_SERVICE_NAME=
 IPTS_USERNAME=
+GH_APP_ID=
+GH_PEM_PATH=
diff --git a/run.sh b/run.sh
index f196ef0..b4e5fde 100755
--- a/run.sh
+++ b/run.sh
@@ -5,6 +5,7 @@ START_TIME=$(date +%s)
 TEMP_LOG_FILE="logs/temp-sqoop-log"
 BACKUP_LOG_FILE="logs/backup-sqoop-log"
 BUCKET_URI="s3://ccao-data-warehouse-us-east-1"
+CRAWLER_NAME="ccao-data-warehouse-iasworld-crawler"
 LOG_GROUP_NAME="/ccao/jobs/sqoop"
 
 # Run all sqoop jobs to extract tables
@@ -59,6 +60,18 @@ done
 # Delete any remaining empty dirs
 find target/ -type d -empty -delete
 
+# Kick off Glue crawler run. Not strictly necessary since 99%
+# of the time we're not creating new partitions or columns,
+# but still nice to run
+echo "Starting AWS Glue crawler run" | ts '%.s' | tee -a "$TEMP_LOG_FILE"
+/usr/bin/aws glue start-crawler --name "$CRAWLER_NAME" || true
+
+# Trigger a workflow to run all dbt tests now that new data is uploaded, but
+# don't let this step crash the log upload
+{ source scripts/dispatch-dbt-workflow.sh || true; } \
+    | ts '%.s' \
+    | tee -a "$TEMP_LOG_FILE"
+
 # Print overall runtime stats and tables extracted
 END_TIME=$(date +%s)
 RUNTIME=$((END_TIME - START_TIME))
diff --git a/scripts/dispatch-dbt-workflow.sh b/scripts/dispatch-dbt-workflow.sh
new file mode 100755
index 0000000..444e886
--- /dev/null
+++ b/scripts/dispatch-dbt-workflow.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+GH_API_REPO="https://api.github.com/repos/ccao-data/data-architecture"
+GH_DBT_WORKFLOW="test_dbt_models.yaml"
+
+# First auth as a GitHub App using a JSON Web Token (JWT).
+# Uses a local PEM file and a Python lib to construct the JWT
+echo "Generating JWT to send to GitHub auth"
+GH_JWT=$(python3 scripts/get-jwt.py)
+
+# Grab the tokens URL from our current installation
+echo "Fetching GitHub tokens URL"
+GH_TOKENS_URL=$(curl -s --request GET \
+    --url "https://api.github.com/app/installations" \
+    --header "Accept: application/vnd.github+json" \
+    --header "Authorization: Bearer ${GH_JWT}" \
+    --header "X-GitHub-Api-Version: 2022-11-28" \
+    | jq -r '.[].access_tokens_url')
+
+# Auth against the tokens URL to get a short-lived installation access token
+echo "Fetching temporary GitHub auth token"
+GH_TOKEN=$(curl -s -L \
+    -X POST \
+    -H "Accept: application/vnd.github+json" \
+    -H "Authorization: Bearer ${GH_JWT}" \
+    -H "X-GitHub-Api-Version: 2022-11-28" \
+    "$GH_TOKENS_URL" \
+    | jq -r '.token')
+
+# Use the token to call the API and dispatch the workflow
+echo "Dispatching workflow"
+curl -v -L \
+    -X POST \
+    -H "Accept: application/vnd.github+json" \
+    -H "Authorization: Bearer ${GH_TOKEN}" \
+    -H "X-GitHub-Api-Version: 2022-11-28" \
+    "$GH_API_REPO"/actions/workflows/"$GH_DBT_WORKFLOW"/dispatches \
+    -d '{"ref": "master"}'
diff --git a/scripts/get-jwt.py b/scripts/get-jwt.py
new file mode 100644
index 0000000..6691e5a
--- /dev/null
+++ b/scripts/get-jwt.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+import jwt
+import os
+import time
+from dotenv import load_dotenv
+
+load_dotenv()
+
+GH_APP_ID = os.getenv("GH_APP_ID")
+GH_PEM_PATH = os.getenv("GH_PEM_PATH")
+
+with open(GH_PEM_PATH, "rb") as pem_file:
+    signing_key = jwt.jwk_from_pem(pem_file.read())
+
+payload = {
+    "iat": int(time.time()),
+    "exp": int(time.time()) + 60,
+    "iss": GH_APP_ID
+}
+
+jwt_instance = jwt.JWT()
+encoded_jwt = jwt_instance.encode(payload, signing_key, alg="RS256")
+print(encoded_jwt)
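One possible hardening for the new crawler step, sketched below and not part of the patch itself: start-crawler errors with a CrawlerRunningException when the crawler is already mid-run, and the "|| true" simply swallows that error. Checking the crawler state first makes the skip explicit. This sketch assumes the same AWS CLI setup and the CRAWLER_NAME / TEMP_LOG_FILE variables that run.sh already defines.

# Optional guard for the Glue crawler step (sketch, not part of the patch).
# start-crawler fails with CrawlerRunningException if the crawler is already
# mid-run; checking the state first avoids relying on "|| true" to hide it.
CRAWLER_STATE=$(/usr/bin/aws glue get-crawler \
    --name "$CRAWLER_NAME" \
    --query 'Crawler.State' \
    --output text)
if [ "$CRAWLER_STATE" = "READY" ]; then
    /usr/bin/aws glue start-crawler --name "$CRAWLER_NAME"
else
    echo "Crawler $CRAWLER_NAME is $CRAWLER_STATE; skipping start" \
        | ts '%.s' | tee -a "$TEMP_LOG_FILE"
fi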
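For sanity-checking the new dispatch path locally, a rough smoke test from the repo root might look like the following. This is a sketch rather than part of the patch; it assumes .env is populated with GH_APP_ID and GH_PEM_PATH and that the GitHub App installation has write access to Actions on ccao-data/data-architecture.

# Hypothetical smoke test for the new scripts (not part of the patch).
# A well-formed JWT is three dot-separated base64url segments.
python3 scripts/get-jwt.py \
    | awk -F'.' '{ exit (NF == 3 ? 0 : 1) }' \
    && echo "JWT looks well-formed"

# Run the dispatch script directly; a successful workflow_dispatch call
# returns HTTP 204, visible in the verbose curl output.
bash scripts/dispatch-dbt-workflow.sh

The same dispatch could also be triggered ad hoc with the GitHub CLI (gh workflow run test_dbt_models.yaml --ref master --repo ccao-data/data-architecture), which can be handy when debugging the workflow itself rather than the app-auth plumbing.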