Skip to content

Commit

Permalink
update nightly pipeline to publish containers when relevant integ tes…
Browse files Browse the repository at this point in the history
…ts succeed
  • Loading branch information
siddvenk committed Jan 3, 2025
1 parent 2df533b commit 08754b9
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 50 deletions.
84 changes: 42 additions & 42 deletions .github/workflows/docker-nightly-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,48 +118,48 @@ jobs:
echo "DJL_VERSION=$DJL_VERSION" >> $GITHUB_ENV
echo "SERVING_VERSION=$SERVING_VERSION" >> $GITHUB_ENV
echo "DJL_VERSION=$DJL_VERSION" >> $GITHUB_OUTPUT
# Release-mode build (runs only when inputs.mode == 'release').
# Exports BASE_RELEASE_VERSION / RELEASE_VERSION from the SERVING_VERSION env
# value, then runs `docker compose build --no-cache` with the DJL and serving
# versions as build args. `matrix.arch` is passed as the trailing argument,
# presumably naming the compose service to build — confirm against the compose file.
- name: Build release candidate docker image
if: ${{ inputs.mode == 'release' }}
working-directory: serving/docker
run: |
export BASE_RELEASE_VERSION="${{ env.SERVING_VERSION }}"
export RELEASE_VERSION="${{ env.SERVING_VERSION }}-"
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }} \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }} \
${{ matrix.arch }}
# Snapshot build (runs when inputs.mode is empty, 'temp', or 'nightly').
# Runs the gradle :serving:dockerDeb task with -Psnapshot, then builds from
# serving/docker with "-SNAPSHOT" appended to both version build args.
# NIGHTLY="-nightly" is written to GITHUB_ENV so later steps can read it via
# `env.NIGHTLY`.
- name: Build temp docker image
if: ${{ inputs.mode == '' || inputs.mode == 'temp' || inputs.mode == 'nightly' }}
run: |
./gradlew --refresh-dependencies :serving:dockerDeb -Psnapshot
cd serving/docker
export NIGHTLY="-nightly"
echo "NIGHTLY=$NIGHTLY" >> $GITHUB_ENV
docker compose build --no-cache \
--build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
--build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
${{ matrix.arch }}
# Tags the freshly built image and pushes it to ECR. The step has no `if:`, so
# it is not restricted to a particular mode.
#   1. Logs docker in to ECR; the region is taken as the 4th dot-separated
#      field of AWS_ECR_REPO.
#   2. Derives the tag infix `mode`: the DJL version for 'release', "nightly"
#      when inputs.mode is empty, otherwise inputs.mode unchanged.
#   3. Tags the local image (DOCKER_HUB_REPO:<arch><NIGHTLY>) with a run-id tag
#      and a commit-SHA tag, plus a floating "<arch>-nightly" tag in nightly mode.
#   4. Pushes every tag of the ECR repository.
# NOTE(review): `${{ inputs.mode }}` is interpolated directly into the script;
# consider passing it through `env:` if the input can come from an untrusted
# caller — confirm the trigger's trust model.
- name: Tag and push temp image to ECR repo
working-directory: serving/docker
run: |
ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
mode=${{ inputs.mode }}
if [[ "${{ inputs.mode }}" == "release" ]]; then
mode=${{ env.DJL_VERSION }}
fi
if [[ -z "${{ inputs.mode }}" ]]; then
mode="nightly"
fi
tempRunIdTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_RUN_ID}"
tempCommitTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_SHA}"
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempRunIdTag
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempCommitTag
if [[ "$mode" == "nightly" ]]; then
docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} ${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-nightly
fi
time docker push --all-tags ${{ env.AWS_ECR_REPO }}
# - name: Build release candidate docker image
# if: ${{ inputs.mode == 'release' }}
# working-directory: serving/docker
# run: |
# export BASE_RELEASE_VERSION="${{ env.SERVING_VERSION }}"
# export RELEASE_VERSION="${{ env.SERVING_VERSION }}-"
# docker compose build --no-cache \
# --build-arg djl_version=${{ env.DJL_VERSION }} \
# --build-arg djl_serving_version=${{ env.SERVING_VERSION }} \
# ${{ matrix.arch }}
# - name: Build temp docker image
# if: ${{ inputs.mode == '' || inputs.mode == 'temp' || inputs.mode == 'nightly' }}
# run: |
# ./gradlew --refresh-dependencies :serving:dockerDeb -Psnapshot
# cd serving/docker
# export NIGHTLY="-nightly"
# echo "NIGHTLY=$NIGHTLY" >> $GITHUB_ENV
# docker compose build --no-cache \
# --build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
# --build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
# ${{ matrix.arch }}
# - name: Tag and push temp image to ECR repo
# working-directory: serving/docker
# run: |
# ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
# aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
# mode=${{ inputs.mode }}
# if [[ "${{ inputs.mode }}" == "release" ]]; then
# mode=${{ env.DJL_VERSION }}
# fi
# if [[ -z "${{ inputs.mode }}" ]]; then
# mode="nightly"
# fi
# tempRunIdTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_RUN_ID}"
# tempCommitTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_SHA}"
#
# docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempRunIdTag
# docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempCommitTag
# if [[ "$mode" == "nightly" ]]; then
# docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} ${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-nightly
# fi
# time docker push --all-tags ${{ env.AWS_ECR_REPO }}

stop-runners:
if: always()
Expand Down
65 changes: 62 additions & 3 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,28 @@ on:
required: false
type: string
default: ''
# Outputs exposed to callers of this reusable workflow. Each value forwards a
# job-level output; a failure step writes "1" to it, so an empty value means
# no failure was recorded for that group.
outputs:
  failure_cpu:
    description: 'Any cpu integ test failures'
    value: ${{ jobs.test.outputs.failure_cpu }}
  failure_gpu:
    description: 'Any gpu integ test failures'
    value: ${{ jobs.test.outputs.failure_gpu }}
  failure_aarch64:
    description: 'Any aarch64 integ test failures'
    value: ${{ jobs.test.outputs.failure_aarch64 }}
  failure_lmi:
    description: 'Any lmi integ test failures'
    value: ${{ jobs.test.outputs.failure_lmi }}
  failure_trtllm:
    description: 'Any trtllm integ test failures'
    value: ${{ jobs.test.outputs.failure_trtllm }}
  failure_neuron:
    description: 'Any neuron integ test failures'
    value: ${{ jobs.test.outputs.failure_neuron }}
  failure_neuron_unittest:
    description: 'Any neuron unittest failures'
    # Job outputs are addressed as jobs.<job_id>.outputs.<name>; without the
    # `.outputs` segment the expression evaluates to an empty string.
    value: ${{ jobs.transformers-neuronx-container-unit-tests.outputs.failure_neuron_unittest }}

permissions:
id-token: write
Expand Down Expand Up @@ -125,45 +147,72 @@ jobs:
- test: TestCpuFull
instance: ubuntu-latest
gh-runner: true
failure-key: cpu
- test: TestCpuBoth
instance: ubuntu-latest
gh-runner: true
failure-key: cpu
- test: TestGpu
instance: g6
failure-key: gpu
- test: TestAarch64
instance: aarch64
failure-key: aarch64
- test: TestHfHandler
instance: g6
failure-key: lmi
- test: TestTrtLlmHandler1
instance: g6
failure-key: trtllm
- test: TestTrtLlmHandler2
instance: g6
failure-key: trtllm
- test: TestSchedulerSingleGPU
instance: g6
failure-key: lmi
- test: TestSchedulerMultiGPU
instance: g6
failure-key: lmi
- test: TestLmiDist1
instance: g6
failure-key: lmi
- test: TestLmiDist2
instance: g6
failure-key: lmi
- test: TestVllm1
instance: g6
failure-key: lmi
- test: TestVllmLora
instance: g6
failure-key: lmi
- test: TestLmiDistLora
instance: g6
failure-key: lmi
- test: TestNeuronx1
instance: inf2
failure-key: neuron
- test: TestNeuronx2
instance: inf2
failure-key: neuron
- test: TestNeuronxRollingBatch
instance: inf2
failure-key: neuron
- test: TestMultiModal
instance: g6
failure-key: lmi
- test: TestTextEmbedding
instance: g6
failure-key: lmi
- test: TestLmiDistPipelineParallel
instance: g6
failure-key: lmi
# Job-level outputs, one per `failure-key` used in the matrix. Each maps to the
# output of the same name written by the step with id `test-failure`.
# Step outputs must be read through `steps.<id>.outputs.<name>`; omitting
# `.outputs` yields an empty string, so no failure would ever be reported.
outputs:
  failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
  failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
  failure_aarch64: ${{ steps.test-failure.outputs.failure_aarch64 }}
  failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
  failure_trtllm: ${{ steps.test-failure.outputs.failure_trtllm }}
  failure_neuron: ${{ steps.test-failure.outputs.failure_neuron }}
steps:
- uses: actions/checkout@v4
- name: Clean env
Expand Down Expand Up @@ -213,22 +262,28 @@ jobs:
OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
run: |
ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
python -m pytest -s -k ${{ matrix.test.test }} tests.py
number=$RANDOM
if [[ $((number % 2)) -eq 0 ]]; then
exit 1
else
exit 0
fi
# Post-test cleanup inside tests/integration: deletes the `outputs` directory
# and the `awscurl` file. The step has no `if:` condition, so it is skipped once
# an earlier step has failed; the failure path does its own cleanup.
- name: Cleanup
working-directory: tests/integration
run: |
rm -rf outputs
rm awscurl
# Failure handler for a matrix test entry: records which failure group the
# entry belongs to, dumps the captured output files, then cleans up.
- name: On Failure
  id: test-failure
  if: ${{ failure() }}
  working-directory: tests/integration
  run: |
    # Record the failure before any cleanup: the default shell aborts on the
    # first failing command, and a cleanup error must not hide the result.
    # Written to the GITHUB_OUTPUT file; `::set-output` is deprecated.
    echo "failure_${{ matrix.test.failure-key }}=1" >> "$GITHUB_OUTPUT"
    for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
    sudo rm -rf outputs && sudo rm -rf models
    rm awscurl
    ./remove_container.sh
- name: Upload test logs
if: ${{ always() }}
uses: actions/upload-artifact@v4
Expand All @@ -243,6 +298,8 @@ jobs:
- RUN_ID-${{ github.run_id }}
- RUN_NUMBER-${{ github.run_number }}
- SHA-${{ github.sha }}
# Exposes the output written by the step with id `failure_neuron_unittest`.
# The `.outputs` segment is required; without it the expression is empty.
outputs:
  failure_neuron_unittest: ${{ steps.failure_neuron_unittest.outputs.failure_neuron_unittest }}
timeout-minutes: 15
needs: create-runners
steps:
Expand Down Expand Up @@ -301,9 +358,11 @@ jobs:
if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
# Failure handler for the unit-test job: flags the failure as a step output and
# prints the collected results log.
- name: On fail step
  if: ${{ failure() }}
  id: failure_neuron_unittest
  working-directory: engines/python/setup
  run: |
    # Write the flag first so a missing log file cannot prevent it from being
    # recorded. Uses the GITHUB_OUTPUT file; `::set-output` is deprecated.
    echo "failure_neuron_unittest=1" >> "$GITHUB_OUTPUT"
    cat logs/results.log
- name: Upload test logs
uses: actions/upload-artifact@v4
with:
Expand Down
17 changes: 12 additions & 5 deletions .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,16 @@ jobs:
with:
tag-suffix: ${{ needs.get_image_tag_suffix.outputs.test_image_tag_suffix }}
# publish: depends on integration-test and get_image_tag_suffix, is marked
# `if: always()`, and echoes each failure_* output of integration-test.
# NOTE(review): this block carries both reusable-workflow keys (`uses`,
# `secrets`, `with`) and step-based keys (`runs-on`, `steps`). GitHub Actions
# does not accept both forms on a single job — presumably these are the old and
# new sides of the change shown together; confirm which form is intended.
publish:
runs-on: ubuntu-latest
needs: [integration-test, get_image_tag_suffix]
uses: ./.github/workflows/docker_publish.yml
secrets: inherit
with:
mode: ${{ inputs.mode || 'nightly' }}
commit_sha: ${{ github.sha }}
if: always()
steps:
- name: Publish Test
run: |
echo "lmi status ${{ needs.integration-test.outputs.failure_lmi }}"
echo "trtllm status ${{ needs.integration-test.outputs.failure_trtllm }}"
echo "gpu status ${{ needs.integration-test.outputs.failure_gpu }}"
echo "cpu status ${{ needs.integration-test.outputs.failure_cpu }}"
echo "aarch64 status ${{ needs.integration-test.outputs.failure_aarch64 }}"
echo "neuron status ${{ needs.integration-test.outputs.failure_neuron }}"
echo "neuron unittest status ${{ needs.integration-test.outputs.failure_neuron_unittest }}"

0 comments on commit 08754b9

Please sign in to comment.