# Skip to content
#
# Fetch and process data every day #72
#
# Fetch and process data every day
#
# Fetch and process data every day #72

# Workflow-level settings: display name, triggers, and environment shared by
# every job.
name: Fetch and process data every day

on:
  schedule:
    # Fires at minute 0 of every fourth hour. Note that this is more frequent
    # than the "every day" in the workflow name suggests.
    - cron: "0 */4 * * *"
  # Also allows the workflow to be started manually.
  workflow_dispatch:

env:
  # DVC remote credentials, read from repository secrets and made available
  # to all jobs as environment variables.
  DVC_ACCESS_KEY_ID: ${{ secrets.DVC_ACCESS_KEY_ID }}
  DVC_SECRET_ACCESS_KEY: ${{ secrets.DVC_SECRET_ACCESS_KEY }}

# Every job definition must be nested one level beneath this key.
jobs:
# Job `test_the_apis` (an entry of the workflow's `jobs` mapping): installs the
# project with Poetry and runs the weather API test script.
# NOTE(review): GitHub has announced retirement of the ubuntu-20.04 hosted
# image - confirm this runner label is still served before relying on it.
test_the_apis:
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  steps:
    - name: Checkout respository
      uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version can never be re-typed as a number.
        python-version: '3.10.2'
    # The cache key embeds the Poetry version requested in the next step;
    # presumably ~/.local is where that installer puts Poetry - TODO confirm.
    - name: cache poetry install
      # actions/cache v1-v2 have been shut down by GitHub; v4 accepts the same
      # `path`/`key` inputs and still exposes the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.local
        key: poetry-1.8.2
    - uses: snok/install-poetry@v1
      with:
        version: '1.8.2'
        virtualenvs-create: true
        virtualenvs-in-project: true
    # Caches the in-project virtualenv, keyed on the lock file contents.
    - name: cache deps
      id: cache-deps
      uses: actions/cache@v4
      with:
        path: .venv
        key: pydeps-${{ hashFiles('**/poetry.lock') }}
    # Dependencies are only installed when the virtualenv cache missed.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    # Always runs: installs the project itself (no --no-root here).
    - run: poetry install --no-interaction
    - name: Test APIs
      run: |
        poetry run python tests/test_weather_api.py
# Job `fetch_strava_data` (an entry of the workflow's `jobs` mapping): pulls
# the DVC-tracked data, runs src/data/fetch_strava.py, pushes the updated data
# to the DVC remote, commits to `main`, and publishes the resulting commit SHA.
fetch_strava_data:
  needs: test_the_apis
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # Commit SHA written to GITHUB_OUTPUT by the last step of this job.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original referenced an id that no step declared, so the condition was
      # always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match; a `restore-keys` fallback
    # still reports cache-hit as not 'true'.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - name: Install dependencies
      run: |
        poetry install
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Fetch strava data
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/fetch_strava.py; then
          echo "fetch_strava.py executed successfully."
        else
          echo "Error executing fetch_strava.py"
          exit 1
        fi
    - name: Add data to DVC
      run: |
        poetry run dvc add data
    - name: DVC push
      run: |
        poetry run dvc push -r origin
    - uses: stefanzweifel/git-auto-commit-action@v5
      with:
        commit_message: "action: Fetch strava data from API"
        branch: main
    - name: Get commit SHA and store it in GITHUB_OUTPUT
      id: sha_new
      run: echo "SHA_NEW=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
# Job `process_strava_data` (an entry of the workflow's `jobs` mapping): checks
# out the commit produced by `fetch_strava_data`, runs
# src/data/preprocess_strava.py, pushes the data to the DVC remote, commits to
# `main`, and publishes the resulting commit SHA.
process_strava_data:
  needs: fetch_strava_data
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # Commit SHA written to GITHUB_OUTPUT by the last step of this job.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      env:
        SHA_NEW: ${{ needs.fetch_strava_data.outputs.sha_new }}
      with:
        # Start from the commit the previous job reported, with full history.
        ref: ${{ needs.fetch_strava_data.outputs.sha_new }}
        fetch-depth: 0
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original declared no such id, so the condition was always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - run: poetry install --no-interaction
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Process strava data
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/preprocess_strava.py; then
          echo "preprocess_strava.py executed successfully."
        else
          echo "Error executing preprocess_strava.py"
          exit 1
        fi
    - name: Add data to DVC
      run: |
        poetry run dvc add data
    - name: DVC push
      run: |
        poetry run dvc push -r origin
    - uses: stefanzweifel/git-auto-commit-action@v5
      with:
        branch: main
        commit_message: "action: Preprocess Strava data"
    - name: Get commit SHA and store it in GITHUB_OUTPUT
      id: sha_new
      run: echo "SHA_NEW=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
# Job `fetch_weather_data` (an entry of the workflow's `jobs` mapping): checks
# out the commit produced by `process_strava_data`, runs
# src/data/fetch_weather.py, pushes the data to the DVC remote, commits to
# `main`, and publishes the resulting commit SHA.
fetch_weather_data:
  needs: process_strava_data
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # Commit SHA written to GITHUB_OUTPUT by the last step of this job.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      env:
        SHA_NEW: ${{ needs.process_strava_data.outputs.sha_new }}
      with:
        # Start from the commit the previous job reported, with full history.
        ref: ${{ needs.process_strava_data.outputs.sha_new }}
        fetch-depth: 0
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original declared no such id, so the condition was always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - run: poetry install --no-interaction
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Fetch weather data
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/fetch_weather.py; then
          echo "fetch_weather.py executed successfully."
        else
          echo "Error executing fetch_weather.py"
          exit 1
        fi
    - name: Add data to DVC
      run: |
        poetry run dvc add data
    - name: DVC push
      run: |
        poetry run dvc push -r origin
    - uses: stefanzweifel/git-auto-commit-action@v5
      with:
        branch: main
        commit_message: "action: Fetching weather"
    - name: Get commit SHA and store it in GITHUB_OUTPUT
      id: sha_new
      run: echo "SHA_NEW=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
# Job `process_weather_data` (an entry of the workflow's `jobs` mapping):
# checks out the commit produced by `fetch_weather_data`, runs
# src/data/preprocess_weather.py, pushes the data to the DVC remote, commits to
# `main`, and publishes the resulting commit SHA.
process_weather_data:
  needs: fetch_weather_data
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # Commit SHA written to GITHUB_OUTPUT by the last step of this job.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      env:
        SHA_NEW: ${{ needs.fetch_weather_data.outputs.sha_new }}
      with:
        # Start from the commit the previous job reported, with full history.
        ref: ${{ needs.fetch_weather_data.outputs.sha_new }}
        fetch-depth: 0
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original declared no such id, so the condition was always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - run: poetry install --no-interaction
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Preprocess weather
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/preprocess_weather.py; then
          echo "preprocess_weather.py executed successfully."
        else
          echo "Error executing preprocess_weather.py"
          exit 1
        fi
    - name: Add data to DVC
      run: |
        poetry run dvc add data
    - name: DVC push
      run: |
        poetry run dvc push -r origin
    - uses: stefanzweifel/git-auto-commit-action@v5
      with:
        branch: main
        commit_message: "action: preprocess weather data"
    - name: Get commit SHA and store it in GITHUB_OUTPUT
      id: sha_new
      run: echo "SHA_NEW=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
# Job `merge_data` (an entry of the workflow's `jobs` mapping): checks out the
# commit produced by `process_weather_data`, runs src/data/merge_data.py,
# pushes the data to the DVC remote, commits to `main`, and publishes the
# resulting commit SHA.
merge_data:
  needs: process_weather_data
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # Commit SHA written to GITHUB_OUTPUT by the last step of this job.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      env:
        SHA_NEW: ${{ needs.process_weather_data.outputs.sha_new }}
      with:
        # Start from the commit the previous job reported, with full history.
        ref: ${{ needs.process_weather_data.outputs.sha_new }}
        fetch-depth: 0
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original declared no such id, so the condition was always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - run: poetry install --no-interaction
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Merge data
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/merge_data.py; then
          echo "merge_data.py executed successfully."
        else
          echo "Error executing merge_data.py"
          exit 1
        fi
    - name: Add data to DVC
      run: |
        poetry run dvc add data
    - name: DVC push
      run: |
        poetry run dvc push -r origin
    - uses: stefanzweifel/git-auto-commit-action@v5
      with:
        branch: main
        commit_message: "action: merge data"
    - name: Get commit SHA and store it in GITHUB_OUTPUT
      id: sha_new
      run: echo "SHA_NEW=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
# Job `validate_data` (an entry of the workflow's `jobs` mapping): checks out
# the commit produced by `merge_data`, copies the processed CSVs into
# data/current_reference, runs src/data/validate_data.py, and deploys the
# generated report directory to Netlify when it exists. This job makes no
# commit; the SHA it publishes is the checked-out HEAD.
validate_data:
  needs: merge_data
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # Commit SHA written to GITHUB_OUTPUT by the last step of this job.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      env:
        SHA_NEW: ${{ needs.merge_data.outputs.sha_new }}
      with:
        # Start from the commit the previous job reported, with full history.
        ref: ${{ needs.merge_data.outputs.sha_new }}
        fetch-depth: 0
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original declared no such id, so the condition was always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - run: poetry install --no-interaction
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Update current data
      run: |
        cp data/processed/is_active.csv data/current_reference/is_active_current.csv
        cp data/processed/kudos_dataset.csv data/current_reference/kudos_current.csv
    - name: Validate data
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/validate_data.py; then
          echo "validate_data.py executed successfully."
        else
          echo "Error executing validate_data.py"
          exit 1
        fi
    - name: Check if validation report exists
      id: check-report
      # Records the result as the `report-exists` environment variable, which
      # the deploy step's `if:` reads through the `env` context.
      run: |
        if [ -d "gx/uncommitted/data_docs/local_site" ]; then
          echo "Report directory exists."
          echo "report-exists=true" >> "$GITHUB_ENV"
        else
          echo "Report directory does not exist."
          echo "report-exists=false" >> "$GITHUB_ENV"
        fi
    - name: Deploy validation output to Netlify
      if: env.report-exists == 'true'
      uses: nwtgck/actions-netlify@v1.2
      with:
        publish-dir: "gx/uncommitted/data_docs/local_site"
        production-deploy: true
      env:
        NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
        NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
    - name: Get commit SHA and store it in GITHUB_OUTPUT
      id: sha_new
      run: echo "SHA_NEW=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
# Job `test_data` (an entry of the workflow's `jobs` mapping): checks out the
# commit reported by `validate_data`, copies the processed CSVs into
# data/current_reference, runs src/data/evidently_test.py, deploys
# reports/evidently to Netlify when it exists, then copies the current CSVs
# over the reference CSVs, pushes the data to the DVC remote, commits to
# `main`, and publishes the resulting commit SHA.
test_data:
  needs: validate_data
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # Commit SHA written to GITHUB_OUTPUT by the last step of this job.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      env:
        SHA_NEW: ${{ needs.validate_data.outputs.sha_new }}
      with:
        # Start from the commit the previous job reported, with full history.
        ref: ${{ needs.validate_data.outputs.sha_new }}
        fetch-depth: 0
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original declared no such id, so the condition was always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - run: poetry install --no-interaction
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Update current data
      run: |
        cp data/processed/is_active.csv data/current_reference/is_active_current.csv
        cp data/processed/kudos_dataset.csv data/current_reference/kudos_current.csv
    - name: Test data
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/evidently_test.py; then
          echo "evidently_test.py executed successfully."
        else
          echo "Error executing evidently_test.py"
          exit 1
        fi
    - name: Check if evidently file exists
      id: check-report
      # Records the result as the `report-exists` environment variable, which
      # the deploy step's `if:` reads through the `env` context.
      run: |
        if [ -d "reports/evidently" ]; then
          echo "Report directory exists."
          echo "report-exists=true" >> "$GITHUB_ENV"
        else
          echo "Report directory does not exist."
          echo "report-exists=false" >> "$GITHUB_ENV"
        fi
    - name: Deploy validation output to Netlify
      if: env.report-exists == 'true'
      uses: nwtgck/actions-netlify@v1.2
      with:
        publish-dir: "reports/evidently"
        production-deploy: true
      env:
        NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
        # A different site id secret from the one used for the validation
        # report deployment.
        NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SECOND_SITE_ID }}
    # The data that was just tested becomes the reference copy.
    - name: Update reference data
      run: |
        cp data/current_reference/is_active_current.csv data/current_reference/is_active_reference.csv
        cp data/current_reference/kudos_current.csv data/current_reference/kudos_reference.csv
    - name: Add data to DVC
      run: |
        poetry run dvc add data
    - name: DVC push
      run: |
        poetry run dvc push -r origin
    - uses: stefanzweifel/git-auto-commit-action@v5
      with:
        branch: main
        commit_message: "action: test data"
    - name: Get commit SHA and store it in GITHUB_OUTPUT
      id: sha_new
      run: echo "SHA_NEW=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
# Job `split_data` (an entry of the workflow's `jobs` mapping): checks out the
# commit produced by `test_data`, runs src/data/split_data.py, pushes the data
# to the DVC remote, and commits to `main`.
split_data:
  needs: test_data
  runs-on: ubuntu-20.04
  timeout-minutes: 20
  outputs:
    # NOTE(review): no step with id `sha_new` is visible in this job, so this
    # output stays empty unless such a step follows beyond the visible source
    # - confirm, then either add the step or drop the output.
    sha_new: ${{ steps.sha_new.outputs.SHA_NEW }}
  permissions:
    contents: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      env:
        SHA_NEW: ${{ needs.test_data.outputs.sha_new }}
      with:
        # Start from the commit the previous job reported, with full history.
        ref: ${{ needs.test_data.outputs.sha_new }}
        fetch-depth: 0
    - name: Cache dependencies
      # This id is what the `if:` on the conditional install step reads; the
      # original declared no such id, so the condition was always true.
      id: cache-deps
      # actions/cache v1-v2 have been shut down by GitHub; v4 keeps the same
      # inputs and the `cache-hit` output.
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-poetry-
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.2'
    - name: Install Poetry
      run: |
        curl -sSL https://install.python-poetry.org | python3 -
        poetry --version
    # Skipped only on an exact cache-key match.
    - run: poetry install --no-interaction --no-root
      if: steps.cache-deps.outputs.cache-hit != 'true'
    - run: poetry install --no-interaction
    - name: Install DVC
      uses: iterative/setup-dvc@v1
    - name: Configure Dagshub
      # Credentials come from the workflow-level `env`; quoted so the shell
      # passes each one as a single argument.
      run: |
        poetry run dvc remote modify origin endpointurl https://dagshub.com/JanHuntersi/strava_prediction.s3
        poetry run dvc remote modify origin --local access_key_id "$DVC_ACCESS_KEY_ID"
        poetry run dvc remote modify origin --local secret_access_key "$DVC_SECRET_ACCESS_KEY"
    - name: DVC Pull Data
      run: |
        poetry run dvc pull -r origin
    - name: Split data
      # The command is tested directly: run steps use `bash -e` by default, so
      # a separate `$?` check placed after the command never reaches its
      # failure branch.
      run: |
        if poetry run python src/data/split_data.py; then
          echo "split_data.py executed successfully."
        else
          echo "Error executing split_data.py"
          exit 1
        fi
    - name: Add data to DVC
      run: |
        poetry run dvc add data
    - name: DVC push
      run: |
        poetry run dvc push -r origin
    - uses: stefanzweifel/git-auto-commit-action@v5
      with:
        branch: main
        commit_message: "action: split data"