# instant benchmark tooling (#92)
name: instant benchmark tooling

on:
  workflow_dispatch:
    inputs:
      running_template:
        description: 'A json file that contains benchmark plans'
        required: true
      instance:
        description: 'Instance used for benchmark'
        required: true
        default: 'g5.12xlarge'
        type: choice
        options:
          - g5.2xlarge
          - g5.12xlarge
          - g5.48xlarge
          - g4dn.12xlarge
          - g4dn.2xlarge
          - p4d.24xlarge
          - inf2.8xlarge
          - inf2.24xlarge
          - trn1.2xlarge
          - trn1.32xlarge
      container:
        description: 'The container used to run the benchmark (overrides the template). Should be a full docker path such as deepjavalibrary/djl-serving:0.25.0-deepspeed'
        required: false
        default: ''
      record:
        description: 'Whether to record the results'
        default: 'none'
        type: choice
        options:
          - none
          - table
          - cloudwatch
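
# Illustrative only: the workflow can be dispatched from the gh CLI, assuming the
# file lives at .github/workflows/instant_benchmark.yml (the plan path below is a
# hypothetical placeholder):
#   gh workflow run instant_benchmark.yml \
#     -f running_template=benchmarks/plans/example.json \
#     -f instance=g5.12xlarge \
#     -f record=table
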
permissions:
  id-token: write
  contents: read
jobs:
  create-runners:
    runs-on: [ self-hosted, scheduler ]
    steps:
      - name: Create new instance
        id: create_instance
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
            https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
            --fail \
            | jq '.token' | tr -d '"' )
          ./start_instance.sh action_ib_${{ github.event.inputs.instance }} $token djl-serving
    outputs:
      gpu_instance_id: ${{ steps.create_instance.outputs.action_ib_instance_id }}
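
  # For reference: the registration-token call above returns JSON shaped like the
  # following (values are illustrative placeholders); jq extracts .token:
  #   { "token": "XXXXXX", "expires_at": "2024-01-22T12:13:35.123-08:00" }
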
  environment-setup:
    runs-on: [ self-hosted, "${{ github.event.inputs.instance }}" ]
    timeout-minutes: 15
    needs: [ create-runners ]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python3
        uses: actions/setup-python@v5
        with:
          python-version: '3.10.x'
      - name: Clean env
        run: |
          yes | docker system prune -a --volumes
      - name: Install deps
        run: |
          sudo apt-get update
          sudo apt-get install awscli -y
          pip3 install boto3
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
          aws-region: us-east-1
      - name: Parse job schema
        working-directory: tests/integration
        id: generate_matrix
        run: |
          python3 instant_benchmark.py --parse ${{ github.event.inputs.running_template }} \
            --container "${{ github.event.inputs.container }}"
    outputs:
      jobs: ${{ steps.generate_matrix.outputs.jobs }}
      template: ${{ steps.generate_matrix.outputs.template }}
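
  # Note (an assumption about instant_benchmark.py, whose source is not shown
  # here): for the matrix fan-out below to work, the parse step must publish its
  # results through $GITHUB_OUTPUT, e.g. something like:
  #   echo 'jobs=["job1","job2"]' >> $GITHUB_OUTPUT
  #   echo 'template={...}' >> $GITHUB_OUTPUT
  # so that fromJSON(needs.environment-setup.outputs.jobs) yields one matrix
  # entry per job.
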
  benchmark_run:
    runs-on: [ self-hosted, "${{ github.event.inputs.instance }}" ]
    timeout-minutes: 30
    needs: [ environment-setup ]
    strategy:
      matrix:
        job: ${{ fromJSON(needs.environment-setup.outputs.jobs) }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python3
        uses: actions/setup-python@v5
        with:
          python-version: '3.10.x'
      - name: Install deps
        run: |
          sudo apt-get update
          sudo apt-get install awscli -y
          pip3 install boto3
      - name: Setup awscurl
        working-directory: tests/integration
        run: |
          wget https://github.com/frankfliu/junkyard/releases/download/v0.3.1/awscurl
          chmod +x awscurl
      - name: Run benchmark job
        working-directory: tests/integration
        run: |
          echo "${{ needs.environment-setup.outputs.template }}" >> template.json
          python3 instant_benchmark.py --template template.json \
            --job ${{ matrix.job }} --instance ${{ github.event.inputs.instance }}
          bash instant_benchmark.sh
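
      # Flow note (an assumption about instant_benchmark.py, whose source is not
      # shown here): the template JSON echoed above is the parsed plan published
      # by the environment-setup job, and the --job/--instance invocation is
      # expected to generate instant_benchmark.sh with the concrete commands for
      # this matrix entry, which the final line then executes.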
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
          aws-region: us-east-1
      - name: Record benchmark job
        if: ${{ github.event.inputs.record == 'table' || github.event.inputs.record == 'cloudwatch' }}
        working-directory: tests/integration
        run: |
          python3 record_benchmark.py --template template.json \
            --job ${{ matrix.job }} --instance ${{ github.event.inputs.instance }} \
            --model models/test --record ${{ github.event.inputs.record }}
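
      # Illustrative sketch only (record_benchmark.py's internals are not shown
      # here): recording to CloudWatch could be done with a call such as
      #   aws cloudwatch put-metric-data --namespace "djl-benchmark" \
      #     --metric-name throughput --value 42.0 --unit Count/Second
      # where the namespace and metric name are hypothetical placeholders.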
      - name: Get serving logs
        if: always()
        working-directory: tests/integration
        run: |
          docker rm -f $(docker ps -aq) || true
          cat logs/serving.log
      - name: Upload test artifacts
        uses: actions/upload-artifact@v3
        if: always()
        with:
          name: ${{ matrix.job }}
          path: tests/integration
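
  # stop-runners is gated on if: always() so the EC2 instance created by
  # create-runners is reclaimed even when environment-setup or benchmark_run
  # fails or is cancelled.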
  stop-runners:
    if: always()
    runs-on: [ self-hosted, scheduler ]
    needs: [ create-runners, environment-setup, benchmark_run ]
    steps:
      - name: Stop instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          instance_id=${{ needs.create-runners.outputs.gpu_instance_id }}
          ./stop_instance.sh $instance_id