Skip to content

Commit

Permalink
do not fail-fast for docker publish (#2642)
Browse files Browse the repository at this point in the history
  • Loading branch information
siddvenk authored Dec 18, 2024
1 parent 92a075f commit eaa70af
Showing 1 changed file with 115 additions and 110 deletions.
225 changes: 115 additions & 110 deletions .github/workflows/docker-nightly-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,100 +22,65 @@ permissions:
contents: read

jobs:
nightly-build:
runs-on: ubuntu-latest
strategy:
matrix:
arch: [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi ]
create-runners:
runs-on: [ self-hosted, scheduler ]
steps:
- name: Clean disk space
- name: Create new CPU instance
id: create_cpu_1
run: |
sudo rm -rf \
/usr/share/dotnet /usr/local/lib/android /opt/ghc \
/usr/local/share/powershell /usr/share/swift /usr/local/.ghcup \
$AGENT_TOOLSDIRECTORY
- uses: actions/checkout@v4
- name: Login to Docker
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: install awscli
run: |
sudo apt-get update
sudo apt-get install awscli -y
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
aws-region: us-east-1
- name: Set up JDK 17
uses: actions/setup-java@v4
with:
distribution: 'corretto'
java-version: 17
- uses: actions/cache@v4
with:
path: ~/.gradle/caches
key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
- name: Extract DJL and DJL Serving versions from TOML
id: get-versions
run: |
DJL_VERSION=$(awk -F '=' '/djl / {gsub(/ ?"/, "", $2); print $2}' gradle/libs.versions.toml)
SERVING_VERSION=$(awk -F '=' '/serving / {gsub(/ ?"/, "", $2); print $2}' gradle/libs.versions.toml)
echo "DJL_VERSION=$DJL_VERSION" >> $GITHUB_ENV
echo "SERVING_VERSION=$SERVING_VERSION" >> $GITHUB_ENV
- name: Build serving package for nightly
if: ${{ inputs.mode == '' || inputs.mode == 'nightly' }}
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_2
run: |
./gradlew --refresh-dependencies :serving:dockerDeb -Psnapshot
- name: Build and push nightly docker image
if: ${{ inputs.mode == '' || inputs.mode == 'nightly' }}
working-directory: serving/docker
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_3
run: |
export NIGHTLY="-nightly"
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
${{ matrix.arch }}
docker compose push ${{ matrix.arch }}
- name: Build and push temp image
if: ${{ inputs.mode == 'temp' }}
working-directory: serving/docker
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_4
run: |
export NIGHTLY="-nightly"
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
${{ matrix.arch }}
repo="185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin $repo
tempTag="$repo:${{ matrix.arch }}-${GITHUB_SHA}"
docker tag deepjavalibrary/djl-serving:${{ matrix.arch }}-nightly $tempTag
docker push $tempTag
- name: Build and push release docker image
if: ${{ inputs.mode == 'release' }}
working-directory: serving/docker
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_5
run: |
export BASE_RELEASE_VERSION="${{ env.SERVING_VERSION }}"
export RELEASE_VERSION="${{ env.SERVING_VERSION }}-"
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }} \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }} \
${{ matrix.arch }}
docker compose push ${{ matrix.arch }}
- name: Retag image for release
if: ${{ matrix.arch == 'cpu' && inputs.mode == 'release' }}
working-directory: serving/docker
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu_6
run: |
docker tag deepjavalibrary/djl-serving:${{ env.SERVING_VERSION }} deepjavalibrary/djl-serving:latest
docker push deepjavalibrary/djl-serving:latest
create-runner:
runs-on: [ self-hosted, scheduler ]
steps:
- name: Create new Graviton instance
id: create_aarch64
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create Graviton instance
id: create_graviton_1
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
Expand All @@ -124,28 +89,52 @@ jobs:
| jq '.token' | tr -d '"' )
./start_instance.sh action_graviton $token djl-serving
outputs:
aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}

nightly-aarch64:
cpu_instance_id_1: ${{ steps.create_cpu_1.outputs.action_cpu_instance_id }}
cpu_instance_id_2: ${{ steps.create_cpu_2.outputs.action_cpu_instance_id }}
cpu_instance_id_3: ${{ steps.create_cpu_3.outputs.action_cpu_instance_id }}
cpu_instance_id_4: ${{ steps.create_cpu_4.outputs.action_cpu_instance_id }}
cpu_instance_id_5: ${{ steps.create_cpu_5.outputs.action_cpu_instance_id }}
cpu_instance_id_6: ${{ steps.create_cpu_6.outputs.action_cpu_instance_id }}
graviton_instance_id_1: ${{ steps.create_graviton_1.outputs.action_graviton_instance_id }}
nightly-build:
needs: create-runners
strategy:
fail-fast: false
matrix:
containers:
- name: cpu
instance: cpu
- name: cpu-full
instance: cpu
- name: pytorch-inf2
instance: cpu
- name: pytorch-gpu
instance: cpu
- name: tensorrt-llm
instance: cpu
- name: lmi
instance: cpu
- name: aarch64
instance: aarch64
runs-on:
- self-hosted
- aarch64
- ${{ matrix.containers.instance }}
- RUN_ID-${{ github.run_id }}
- RUN_NUMBER-${{ github.run_number }}
- SHA-${{ github.sha }}
timeout-minutes: 60
needs: create-runner
steps:
- uses: actions/checkout@v4
- name: Clean docker env
working-directory: serving/docker
- name: Clean disk space
run: |
yes | docker system prune -a --volumes
sudo rm -rf \
/usr/share/dotnet /usr/local/lib/android /opt/ghc \
/usr/local/share/powershell /usr/share/swift /usr/local/.ghcup \
$AGENT_TOOLSDIRECTORY
- uses: actions/checkout@v4
- name: Login to Docker
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: install awscli
run: |
sudo apt-get update
Expand Down Expand Up @@ -183,8 +172,8 @@ jobs:
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
aarch64
docker compose push aarch64
${{ matrix.containers.name }}
docker compose push ${{ matrix.containers.name }}
- name: Build and push temp image
if: ${{ inputs.mode == 'temp' }}
working-directory: serving/docker
Expand All @@ -193,11 +182,11 @@ jobs:
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
aarch64
${{ matrix.containers.name }}
repo="185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin $repo
tempTag="$repo:aarch64-${GITHUB_SHA}"
docker tag deepjavalibrary/djl-serving:aarch64-nightly $tempTag
tempTag="$repo:${{ matrix.containers.name }}-${GITHUB_SHA}"
docker tag deepjavalibrary/djl-serving:${{ matrix.containers.name }}-nightly $tempTag
docker push $tempTag
- name: Build and push release docker image
if: ${{ inputs.mode == 'release' }}
Expand All @@ -208,17 +197,33 @@ jobs:
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }} \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }} \
aarch64
docker compose push aarch64
stop-runner:
${{ matrix.containers.name }}
docker compose push ${{ matrix.containers.name }}
- name: Retag image for release
if: ${{ matrix.containers.name == 'cpu' && inputs.mode == 'release' }}
working-directory: serving/docker
run: |
docker tag deepjavalibrary/djl-serving:${{ env.SERVING_VERSION }} deepjavalibrary/djl-serving:latest
docker push deepjavalibrary/djl-serving:latest
stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [nightly-aarch64, create-runner]
needs: [nightly-build, create-runners]
steps:
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-runner.outputs.aarch64_instance_id }}
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_1 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_2 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_3 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_4 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_5 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id_6 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.graviton_instance_id_1 }}
./stop_instance.sh $instance_id

0 comments on commit eaa70af

Please sign in to comment.