Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable selections in docker image build workflow #2643

Merged
merged 13 commits into from
Dec 30, 2024
149 changes: 48 additions & 101 deletions .github/workflows/docker-nightly-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,23 @@ on:
description: 'release/nightly/temp, default is nightly'
required: true
default: 'nightly'
# Images to build, passed as a JSON array encoded in a string (see default).
# The jobs' matrix expressions parse this value with fromJson; a value that
# does not start with '[' is wrapped in brackets before it is parsed.
arch:
description: 'which images to build [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64]'
type: string
required: false
default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'
workflow_call:
inputs:
mode:
description: 'release/nightly/temp, default is nightly'
type: string
required: true
default: 'nightly'
# Images to build, passed as a JSON array encoded in a string (see default).
# The jobs' matrix expressions parse this value with fromJson; a value that
# does not start with '[' is wrapped in brackets before it is parsed.
arch:
description: 'which images to build [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64]'
type: string
required: false
default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'
outputs:
djl_version:
description: "djl version"
Expand All @@ -30,102 +40,45 @@ env:
jobs:
create-runners:
runs-on: [ self-hosted, scheduler ]
strategy:
matrix:
# One matrix entry (and therefore one new instance) per requested image.
# inputs.arch is parsed as JSON directly when it already starts with '[';
# otherwise it is wrapped in brackets first and then parsed.
# NOTE(review): a bare, unquoted name such as cpu would become [cpu], which
# is not valid JSON - confirm callers always pass quoted values.
arch: ${{ startsWith(inputs.arch, '[') && fromJson(inputs.arch) || fromJson(format('[{0}]', inputs.arch)) }}
steps:
- name: Create new CPU instance
id: create_cpu_1
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_2
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_3
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_4
- name: Create new instance
id: create_cpu
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_5
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_6
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create Graviton instance
id: create_graviton_1
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_graviton $token djl-serving
# The aarch64 image is built on a Graviton instance; every other image uses
# a regular CPU instance.
case "${{matrix.arch}}" in
  aarch64) instance_type=action_graviton ;;
  *) instance_type=action_cpu ;;
esac
./start_instance.sh $instance_type $token djl-serving
# Read back the instance_id entry from $GITHUB_OUTPUT (presumably written by
# start_instance.sh - confirm) and re-export it under a per-arch key so each
# matrix run publishes a distinct output name.
instance_id=$(grep "^instance_id=" $GITHUB_OUTPUT | cut -d'=' -f2)
echo "instance_id_${{matrix.arch}}=$instance_id" >>"$GITHUB_OUTPUT"

outputs:
cpu_instance_id_1: ${{ steps.create_cpu_1.outputs.action_cpu_instance_id }}
cpu_instance_id_2: ${{ steps.create_cpu_2.outputs.action_cpu_instance_id }}
cpu_instance_id_3: ${{ steps.create_cpu_3.outputs.action_cpu_instance_id }}
cpu_instance_id_4: ${{ steps.create_cpu_4.outputs.action_cpu_instance_id }}
cpu_instance_id_5: ${{ steps.create_cpu_5.outputs.action_cpu_instance_id }}
cpu_instance_id_6: ${{ steps.create_cpu_6.outputs.action_cpu_instance_id }}
graviton_instance_id_1: ${{ steps.create_graviton_1.outputs.action_graviton_instance_id }}
# Each matrix run of this job writes only the instance_id_<arch> key for its
# own arch (see the echo to $GITHUB_OUTPUT in the create_cpu step). Every
# image name accepted in inputs.arch needs an entry here so the stop-runners
# job can read its instance id from needs.create-runners.outputs.
instance_id_cpu: ${{ steps.create_cpu.outputs.instance_id_cpu }}
instance_id_cpu-full: ${{ steps.create_cpu.outputs.instance_id_cpu-full }}
instance_id_pytorch-inf2: ${{ steps.create_cpu.outputs.instance_id_pytorch-inf2 }}
instance_id_pytorch-gpu: ${{ steps.create_cpu.outputs.instance_id_pytorch-gpu }}
instance_id_tensorrt-llm: ${{ steps.create_cpu.outputs.instance_id_tensorrt-llm }}
instance_id_lmi: ${{ steps.create_cpu.outputs.instance_id_lmi }}
instance_id_aarch64: ${{ steps.create_cpu.outputs.instance_id_aarch64 }}

nightly-build:
needs: create-runners
timeout-minutes: 120
strategy:
fail-fast: false
matrix:
containers:
- name: cpu
instance: cpu
- name: cpu-full
instance: cpu
- name: pytorch-inf2
instance: cpu
- name: pytorch-gpu
instance: cpu
- name: tensorrt-llm
instance: cpu
- name: lmi
instance: cpu
- name: aarch64
instance: aarch64
# One build per requested image. inputs.arch is parsed as JSON directly when
# it already starts with '['; otherwise it is wrapped in brackets first.
arch: ${{ startsWith(inputs.arch, '[') && fromJson(inputs.arch) || fromJson(format('[{0}]', inputs.arch)) }}
runs-on:
- self-hosted
- ${{ matrix.containers.instance }}
# Runner label: the aarch64 image runs on the aarch64-labelled runner, every
# other image on a cpu-labelled one.
- ${{ matrix.arch == 'aarch64' && 'aarch64' || 'cpu' }}
- RUN_ID-${{ github.run_id }}
- RUN_NUMBER-${{ github.run_number }}
- SHA-${{ github.sha }}
Expand Down Expand Up @@ -174,7 +127,7 @@ jobs:
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }} \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }} \
${{ matrix.containers.name }}
${{ matrix.arch }}
- name: Build temp docker image
if: ${{ inputs.mode == '' || inputs.mode == 'temp' || inputs.mode == 'nightly' }}
run: |
Expand All @@ -185,7 +138,7 @@ jobs:
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
${{ matrix.containers.name }}
${{ matrix.arch }}
- name: Tag and push temp image to ECR repo
working-directory: serving/docker
run: |
Expand All @@ -195,35 +148,29 @@ jobs:
if [ "${{ inputs.mode }}" == "release" ]; then
mode=${{ env.DJL_VERSION }}
fi
tempRunIdTag="${{ env.AWS_ECR_REPO }}:${{ matrix.containers.name }}-$mode-${GITHUB_RUN_ID}"
tempCommitTag="${{ env.AWS_ECR_REPO }}:${{ matrix.containers.name }}-$mode-${GITHUB_SHA}"
tempRunIdTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_RUN_ID}"
tempCommitTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_SHA}"

docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.containers.name }}${{ env.NIGHTLY }} $tempRunIdTag
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.containers.name }}${{ env.NIGHTLY }} $tempCommitTag
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempRunIdTag
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempCommitTag
if ${{ inputs.mode == 'nightly' }}; then
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.containers.name }}${{ env.NIGHTLY }} ${{ env.AWS_ECR_REPO }}:${{ matrix.containers.name }}-nightly
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} ${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-nightly
fi
time docker push --all-tags ${{ env.AWS_ECR_REPO }}

stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [nightly-build, create-runners]
env:
runner_output: ${{ toJson(needs.create-runners.outputs) }}
steps:
- name: Stop all instances
continue-on-error: true
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_1 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_2 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_3 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_4 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_5 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_6 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.graviton_instance_id_1 }}
./stop_instance.sh $instance_id
# Best-effort cleanup: walk every output published by create-runners and stop
# the instance it names. $runner_output is quoted so the JSON is passed to jq
# verbatim, with no word-splitting or glob expansion.
failed=0
for key in $(echo "$runner_output" | jq -r '(. // {}) | keys[]'); do
  # `// empty` turns a missing/null value into an empty string rather than
  # the literal text "null".
  instance_id=$(echo "$runner_output" | jq -r --arg key "$key" '.[$key] // empty')
  # Outputs for images that were not selected carry no instance id; skip them
  # rather than invoking the stop script without an argument.
  if [ -z "$instance_id" ]; then
    echo "Key: $key has no instance_id, skipping"
    continue
  fi
  echo "Key: $key, instance_id: $instance_id"
  # Do not let one failed stop abort the loop (the step shell may run with
  # errexit); the remaining instances must still be stopped.
  if ! ./stop_instance.sh "$instance_id"; then
    echo "Failed to stop instance $instance_id"
    failed=1
  fi
done
# Still surface a failure on the step after every instance has been attempted.
if [ "$failed" -ne 0 ]; then
  exit 1
fi
Loading