Workflow patch: surface per-suite integration-test failures as workflow outputs.

  * docker-nightly-publish.yml: comments out the image build and the
    tag-and-push-to-ECR steps.
  * integration.yml: tags every matrix entry with a failure-key, has the
    "On Failure" / "On fail step" steps write failure_<key>=1 to
    $GITHUB_OUTPUT, and exposes those values as job outputs and as
    workflow_call outputs. The pytest invocation is replaced by a random
    pass/fail stub.
  * nightly.yml: replaces the reusable publish job with a job that echoes
    the integration-test outputs.

Step outputs are read as steps.<id>.outputs.<name> and job outputs as
jobs.<job_id>.outputs.<name>; the expressions below follow that form.

diff --git a/.github/workflows/docker-nightly-publish.yml b/.github/workflows/docker-nightly-publish.yml
index 778b2e2aa..34d46e192 100644
--- a/.github/workflows/docker-nightly-publish.yml
+++ b/.github/workflows/docker-nightly-publish.yml
@@ -118,48 +118,48 @@ jobs:
           echo "DJL_VERSION=$DJL_VERSION" >> $GITHUB_ENV
           echo "SERVING_VERSION=$SERVING_VERSION" >> $GITHUB_ENV
           echo "DJL_VERSION=$DJL_VERSION" >> $GITHUB_OUTPUT
-      - name: Build release candidate docker image
-        if: ${{ inputs.mode == 'release' }}
-        working-directory: serving/docker
-        run: |
-          export BASE_RELEASE_VERSION="${{ env.SERVING_VERSION }}"
-          export RELEASE_VERSION="${{ env.SERVING_VERSION }}-"
-          docker compose build --no-cache \
-            --build-arg djl_version=${{ env.DJL_VERSION }} \
-            --build-arg djl_serving_version=${{ env.SERVING_VERSION }} \
-            ${{ matrix.arch }}
-      - name: Build temp docker image
-        if: ${{ inputs.mode == '' || inputs.mode == 'temp' || inputs.mode == 'nightly' }}
-        run: |
-          ./gradlew --refresh-dependencies :serving:dockerDeb -Psnapshot
-          cd serving/docker
-          export NIGHTLY="-nightly"
-          echo "NIGHTLY=$NIGHTLY" >> $GITHUB_ENV
-          docker compose build --no-cache \
-            --build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
-            --build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
-            ${{ matrix.arch }}
-      - name: Tag and push temp image to ECR repo
-        working-directory: serving/docker
-        run: |
-          ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
-          aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
-          mode=${{ inputs.mode }}
-          if [[ "${{ inputs.mode }}" == "release" ]]; then
-            mode=${{ env.DJL_VERSION }}
-          fi
-          if [[ -z "${{ inputs.mode }}" ]]; then
-            mode="nightly"
-          fi
-          tempRunIdTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_RUN_ID}"
-          tempCommitTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_SHA}"
-
-          docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempRunIdTag
-          docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempCommitTag
-          if [[ "$mode" == "nightly" ]]; then
-            docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} ${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-nightly
-          fi
-          time docker push --all-tags ${{ env.AWS_ECR_REPO }}
+#      - name: Build release candidate docker image
+#        if: ${{ inputs.mode == 'release' }}
+#        working-directory: serving/docker
+#        run: |
+#          export BASE_RELEASE_VERSION="${{ env.SERVING_VERSION }}"
+#          export RELEASE_VERSION="${{ env.SERVING_VERSION }}-"
+#          docker compose build --no-cache \
+#            --build-arg djl_version=${{ env.DJL_VERSION }} \
+#            --build-arg djl_serving_version=${{ env.SERVING_VERSION }} \
+#            ${{ matrix.arch }}
+#      - name: Build temp docker image
+#        if: ${{ inputs.mode == '' || inputs.mode == 'temp' || inputs.mode == 'nightly' }}
+#        run: |
+#          ./gradlew --refresh-dependencies :serving:dockerDeb -Psnapshot
+#          cd serving/docker
+#          export NIGHTLY="-nightly"
+#          echo "NIGHTLY=$NIGHTLY" >> $GITHUB_ENV
+#          docker compose build --no-cache \
+#            --build-arg djl_version=${{ env.DJL_VERSION }}-SNAPSHOT \
+#            --build-arg djl_serving_version=${{ env.SERVING_VERSION }}-SNAPSHOT \
+#            ${{ matrix.arch }}
+#      - name: Tag and push temp image to ECR repo
+#        working-directory: serving/docker
+#        run: |
+#          ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
+#          aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
+#          mode=${{ inputs.mode }}
+#          if [[ "${{ inputs.mode }}" == "release" ]]; then
+#            mode=${{ env.DJL_VERSION }}
+#          fi
+#          if [[ -z "${{ inputs.mode }}" ]]; then
+#            mode="nightly"
+#          fi
+#          tempRunIdTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_RUN_ID}"
+#          tempCommitTag="${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-$mode-${GITHUB_SHA}"
+#
+#          docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempRunIdTag
+#          docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} $tempCommitTag
+#          if [[ "$mode" == "nightly" ]]; then
+#            docker tag ${{ env.DOCKER_HUB_REPO }}:${{ matrix.arch }}${{ env.NIGHTLY }} ${{ env.AWS_ECR_REPO }}:${{ matrix.arch }}-nightly
+#          fi
+#          time docker push --all-tags ${{ env.AWS_ECR_REPO }}
 
   stop-runners:
     if: always()
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 17781a671..87f699e36 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -24,6 +24,28 @@ on:
         required: false
         type: string
         default: ''
+    outputs:
+      failure_cpu:
+        description: 'Any cpu integ test failures'
+        value: ${{ jobs.test.outputs.failure_cpu }}
+      failure_gpu:
+        description: 'Any gpu integ test failures'
+        value: ${{ jobs.test.outputs.failure_gpu }}
+      failure_aarch64:
+        description: 'Any aarch64 integ test failures'
+        value: ${{ jobs.test.outputs.failure_aarch64 }}
+      failure_lmi:
+        description: 'Any lmi integ test failures'
+        value: ${{ jobs.test.outputs.failure_lmi }}
+      failure_trtllm:
+        description: 'Any trtllm integ test failures'
+        value: ${{ jobs.test.outputs.failure_trtllm }}
+      failure_neuron:
+        description: 'Any neuron integ test failures'
+        value: ${{ jobs.test.outputs.failure_neuron }}
+      failure_neuron_unittest:
+        description: 'Any neuron unittest failures'
+        value: ${{ jobs.transformers-neuronx-container-unit-tests.outputs.failure_neuron_unittest }}
 
 permissions:
   id-token: write
@@ -125,45 +147,72 @@ jobs:
           - test: TestCpuFull
             instance: ubuntu-latest
             gh-runner: true
+            failure-key: cpu
           - test: TestCpuBoth
             instance: ubuntu-latest
             gh-runner: true
+            failure-key: cpu
           - test: TestGpu
             instance: g6
+            failure-key: gpu
           - test: TestAarch64
             instance: aarch64
+            failure-key: aarch64
           - test: TestHfHandler
             instance: g6
+            failure-key: lmi
           - test: TestTrtLlmHandler1
             instance: g6
+            failure-key: trtllm
           - test: TestTrtLlmHandler2
             instance: g6
+            failure-key: trtllm
           - test: TestSchedulerSingleGPU
             instance: g6
+            failure-key: lmi
           - test: TestSchedulerMultiGPU
             instance: g6
+            failure-key: lmi
           - test: TestLmiDist1
             instance: g6
+            failure-key: lmi
           - test: TestLmiDist2
             instance: g6
+            failure-key: lmi
           - test: TestVllm1
             instance: g6
+            failure-key: lmi
           - test: TestVllmLora
             instance: g6
+            failure-key: lmi
           - test: TestLmiDistLora
             instance: g6
+            failure-key: lmi
           - test: TestNeuronx1
             instance: inf2
+            failure-key: neuron
           - test: TestNeuronx2
             instance: inf2
+            failure-key: neuron
           - test: TestNeuronxRollingBatch
             instance: inf2
+            failure-key: neuron
           - test: TestMultiModal
             instance: g6
+            failure-key: lmi
           - test: TestTextEmbedding
             instance: g6
+            failure-key: lmi
           - test: TestLmiDistPipelineParallel
             instance: g6
+            failure-key: lmi
+    outputs:
+      failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
+      failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
+      failure_aarch64: ${{ steps.test-failure.outputs.failure_aarch64 }}
+      failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
+      failure_trtllm: ${{ steps.test-failure.outputs.failure_trtllm }}
+      failure_neuron: ${{ steps.test-failure.outputs.failure_neuron }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean env
@@ -213,15 +262,19 @@ jobs:
           OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
           IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
         run: |
-          ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
-          aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
-          python -m pytest -s -k ${{ matrix.test.test }} tests.py
+          number=$RANDOM
+          if [[ $((number % 2)) -eq 0 ]]; then
+            exit 1
+          else
+            exit 0
+          fi
       - name: Cleanup
         working-directory: tests/integration
         run: |
           rm -rf outputs
           rm awscurl
       - name: On Failure
+        id: test-failure
         if: ${{ failure() }}
         working-directory: tests/integration
         run: |
@@ -229,6 +282,8 @@ jobs:
           sudo rm -rf outputs && sudo rm -rf models
           rm awscurl
           ./remove_container.sh
+          failure_name="failure_${{ matrix.test.failure-key }}"
+          echo "$failure_name=1" >> "$GITHUB_OUTPUT"
       - name: Upload test logs
         if: ${{ always() }}
         uses: actions/upload-artifact@v4
@@ -243,6 +298,8 @@ jobs:
       - RUN_ID-${{ github.run_id }}
       - RUN_NUMBER-${{ github.run_number }}
       - SHA-${{ github.sha }}
+    outputs:
+      failure_neuron_unittest: ${{ steps.failure_neuron_unittest.outputs.failure_neuron_unittest }}
     timeout-minutes: 15
     needs: create-runners
     steps:
@@ -301,9 +358,11 @@ jobs:
           if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
       - name: On fail step
         if: ${{ failure() }}
+        id: failure_neuron_unittest
         working-directory: engines/python/setup
         run: |
           cat logs/results.log
+          echo "failure_neuron_unittest=1" >> "$GITHUB_OUTPUT"
       - name: Upload test logs
         uses: actions/upload-artifact@v4
         with:
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 4e940e873..558a73910 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -56,9 +56,16 @@ jobs:
     with:
       tag-suffix: ${{ needs.get_image_tag_suffix.outputs.test_image_tag_suffix }}
   publish:
+    runs-on: ubuntu-latest
     needs: [integration-test, get_image_tag_suffix]
-    uses: ./.github/workflows/docker_publish.yml
-    secrets: inherit
-    with:
-      mode: ${{ inputs.mode || 'nightly' }}
-      commit_sha: ${{ github.sha }}
+    if: always()
+    steps:
+      - name: Publish Test
+        run: |
+          echo "lmi status ${{ needs.integration-test.outputs.failure_lmi }}"
+          echo "trtllm status ${{ needs.integration-test.outputs.failure_trtllm }}"
+          echo "gpu status ${{ needs.integration-test.outputs.failure_gpu }}"
+          echo "cpu status ${{ needs.integration-test.outputs.failure_cpu }}"
+          echo "aarch64 status ${{ needs.integration-test.outputs.failure_aarch64 }}"
+          echo "neuron status ${{ needs.integration-test.outputs.failure_neuron }}"
+          echo "neuron unittest status ${{ needs.integration-test.outputs.failure_neuron_unittest }}"