Skip to content
This repository has been archived by the owner on Aug 23, 2024. It is now read-only.

Commit

Permalink
Add script to dispatch dbt tests after sqoop import (#5)
Browse files Browse the repository at this point in the history
* Add crawler trigger to run.sh

* Add initial scripts for dispatching GH workflows

* Update example .env file

* Fix small bash quoting issues

* Fix Glue crawler name

* Add logging messages in dispatch script
  • Loading branch information
dfsnow authored Aug 21, 2023
1 parent 980d904 commit 6e0653b
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ IPTS_HOSTNAME=
IPTS_PORT=
IPTS_SERVICE_NAME=
IPTS_USERNAME=
GH_APP_ID=
GH_PEM_PATH=
13 changes: 13 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ START_TIME=$(date +%s)
# Local log files; TEMP_LOG_FILE is the one appended to via `tee -a` below
TEMP_LOG_FILE='logs/temp-sqoop-log'
BACKUP_LOG_FILE='logs/backup-sqoop-log'
# S3 bucket and Glue crawler for the data warehouse; CRAWLER_NAME is passed
# to `aws glue start-crawler` below
BUCKET_URI='s3://ccao-data-warehouse-us-east-1'
CRAWLER_NAME='ccao-data-warehouse-iasworld-crawler'
# NOTE(review): presumably the CloudWatch log group the logs are uploaded
# to; the upload step is outside this view, so confirm
LOG_GROUP_NAME='/ccao/jobs/sqoop'

# Run all sqoop jobs to extract tables
Expand Down Expand Up @@ -59,6 +60,18 @@ done
# Delete any remaining empty dirs under target/. find's -delete implies
# -depth, so children are visited before their parents and a directory that
# only contained empty directories is removed in the same pass
find target/ -type d -empty -delete

# Kick off Glue crawler run. Not strictly necessary since 99%
# of the time we're not creating new partitions or columns,
# but still nice to run. This step is best-effort: a failure to start the
# crawler is recorded in the log as a warning (instead of being silently
# discarded) and never aborts the rest of the script
echo "Starting AWS Glue crawler run" | ts '%.s' | tee -a "$TEMP_LOG_FILE"
if ! /usr/bin/aws glue start-crawler --name "$CRAWLER_NAME"; then
  # The trailing `|| true` keeps the warning itself from failing the script
  echo "WARNING: Failed to start AWS Glue crawler $CRAWLER_NAME" \
    | ts '%.s' \
    | tee -a "$TEMP_LOG_FILE" \
    || true
fi

# Trigger a workflow to run all dbt tests now that new data is uploaded, but
# don't let this step crash the log upload.
#
# The braces are required: `|` binds tighter than `||`, so without them the
# line parses as `source ... || (true | ts | tee)` and the dispatch script's
# output never reaches ts or the log file. As a pipeline stage the group runs
# in a subshell, so nothing the sourced script sets leaks into this shell.
# Only stdout is piped; stderr is deliberately not merged in because the
# dispatch script's verbose curl output can contain bearer tokens
{ source scripts/dispatch-dbt-workflow.sh || true; } \
  | ts '%.s' \
  | tee -a "$TEMP_LOG_FILE"

# Print overall runtime stats and tables extracted
END_TIME=$(date +%s)
RUNTIME=$((END_TIME - START_TIME))
Expand Down
38 changes: 38 additions & 0 deletions scripts/dispatch-dbt-workflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
#
# Dispatch the dbt test workflow in the data-architecture repository through
# the GitHub REST API, authenticating as a GitHub App.
#
# Meant to be sourced from run.sh; it can also be executed directly. Run it
# from the repository root, because scripts/get-jwt.py is referenced by a
# relative path. Requires python3, curl and jq.
#
# Shell options such as `set -e` are deliberately NOT enabled here: when this
# file is sourced they would leak into the calling script. Every step is
# checked explicitly instead.
#
# All messages, including errors, go to stdout because the caller pipes
# stdout into its timestamped log.

GH_API_REPO="https://api.github.com/repos/ccao-data/data-architecture"
GH_DBT_WORKFLOW="test_dbt_models.yaml"

#######################################
# Authenticate as a GitHub App and dispatch the dbt test workflow.
# Globals:   GH_API_REPO (read), GH_DBT_WORKFLOW (read)
# Arguments: none
# Outputs:   progress and error messages on stdout
# Returns:   0 if GitHub accepted the dispatch (HTTP 2xx), 1 otherwise
#######################################
dispatch_dbt_workflow() {
  # Credentials are kept local so they do not linger in the sourcing shell
  local gh_jwt gh_tokens_url gh_token http_status

  # First auth as a GitHub app using JSON Web Token (JWT).
  # Uses a local PEM file and python lib to construct the JWT
  echo "Generating JWT to send to GitHub auth"
  if ! gh_jwt=$(python3 scripts/get-jwt.py) || [[ -z "$gh_jwt" ]]; then
    echo "ERROR: Failed to generate JWT; skipping workflow dispatch"
    return 1
  fi

  # Grab the token URL from our current installation. Only the first
  # installation is used: iterating with `.[]` would yield a multi-line,
  # unusable URL if the app were ever installed more than once. `--fail`
  # makes curl emit nothing on an HTTP error, and `// empty` maps a missing
  # field to an empty string rather than the literal "null"
  echo "Fetching GitHub tokens URL"
  gh_tokens_url=$(
    curl -sS --fail --request GET \
      --url "https://api.github.com/app/installations" \
      --header "Accept: application/vnd.github+json" \
      --header "Authorization: Bearer ${gh_jwt}" \
      --header "X-GitHub-Api-Version: 2022-11-28" \
      | jq -r '.[0].access_tokens_url // empty'
  )
  if [[ -z "$gh_tokens_url" ]]; then
    echo "ERROR: Could not determine GitHub access tokens URL"
    return 1
  fi

  # Auth against the tokens URL to get an installation access token. GitHub
  # documents these tokens as expiring after one hour; it is the JWT above
  # that is only valid for about a minute
  echo "Fetching temporary GitHub auth token"
  gh_token=$(
    curl -sS --fail -L \
      -X POST \
      -H "Accept: application/vnd.github+json" \
      -H "Authorization: Bearer ${gh_jwt}" \
      -H "X-GitHub-Api-Version: 2022-11-28" \
      "$gh_tokens_url" \
      | jq -r '.token // empty'
  )
  if [[ -z "$gh_token" ]]; then
    echo "ERROR: Could not fetch temporary GitHub auth token"
    return 1
  fi

  # Use the token to call the API and dispatch the workflow. Verbose mode is
  # intentionally not used: `curl -v` prints request headers, which would
  # expose the bearer token. Only the HTTP status code is captured
  echo "Dispatching workflow"
  http_status=$(
    curl -sS -L \
      -o /dev/null \
      -w '%{http_code}' \
      -X POST \
      -H "Accept: application/vnd.github+json" \
      -H "Authorization: Bearer ${gh_token}" \
      -H "X-GitHub-Api-Version: 2022-11-28" \
      "${GH_API_REPO}/actions/workflows/${GH_DBT_WORKFLOW}/dispatches" \
      -d '{"ref": "master"}'
  )
  if [[ "$http_status" != 2?? ]]; then
    echo "ERROR: Workflow dispatch failed (HTTP status: ${http_status:-none})"
    return 1
  fi
  echo "Workflow dispatched successfully (HTTP status: ${http_status})"
}

# Last command of the file, so its status is what `source` reports to the caller
dispatch_dbt_workflow
23 changes: 23 additions & 0 deletions scripts/get-jwt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3
"""Print a short-lived JSON Web Token (JWT) for authenticating as a GitHub App.

Configuration is read from the environment (a local .env file is loaded
first via python-dotenv):

    GH_APP_ID    ID of the GitHub App, used as the JWT issuer ("iss")
    GH_PEM_PATH  Path to the app's private key in PEM format

On success the encoded JWT is written to stdout and nothing else, so callers
can capture it with command substitution. On a configuration or key-file
error a message is written to stderr and the exit status is 1.
"""
import os
import sys
import time

import jwt
from dotenv import load_dotenv


def main():
    """Build the JWT from the configured app ID and key, and print it."""
    load_dotenv()

    gh_app_id = os.getenv("GH_APP_ID")
    gh_pem_path = os.getenv("GH_PEM_PATH")

    # Fail with a clear message rather than an opaque TypeError from
    # open(None) or a token whose issuer is None
    missing = [
        name
        for name, value in (("GH_APP_ID", gh_app_id), ("GH_PEM_PATH", gh_pem_path))
        if not value
    ]
    if missing:
        sys.exit("Missing required environment variable(s): " + ", ".join(missing))

    try:
        with open(gh_pem_path, "rb") as pem_file:
            signing_key = jwt.jwk_from_pem(pem_file.read())
    except OSError as err:
        sys.exit(f"Unable to read PEM file {gh_pem_path!r}: {err}")

    # Sample the clock once so "iat" and "exp" are derived from the same instant
    now = int(time.time())
    payload = {
        # Issued-at is backdated by 60 seconds, as GitHub recommends, so
        # that clock drift between this host and GitHub does not cause
        # the token to be rejected
        "iat": now - 60,
        # Expires 60 seconds from now
        "exp": now + 60,
        "iss": gh_app_id,
    }

    jwt_instance = jwt.JWT()
    encoded_jwt = jwt_instance.encode(payload, signing_key, alg="RS256")
    print(encoded_jwt)


if __name__ == "__main__":
    main()

0 comments on commit 6e0653b

Please sign in to comment.