From 5d02634128c487fb16673cb18b7f4861128d204c Mon Sep 17 00:00:00 2001
From: Zach Kimberg
Date: Wed, 12 Jun 2024 11:54:37 -0700
Subject: [PATCH 1/2] [CI] Inferentia tests through pytest

This moves the inferentia2 integration tests into the merged suite along
with the rest of the llm integration tests.
---
 .github/workflows/client-test.yml          |   2 +-
 .github/workflows/llm_inf2_integration.yml | 463 ---------------------
 .github/workflows/llm_integration.yml      | 125 +++++-
 tests/{ => integration}/test_client.py     |   7 +-
 tests/integration/tests.py                 | 192 ++++++++-
 5 files changed, 303 insertions(+), 486 deletions(-)
 delete mode 100644 .github/workflows/llm_inf2_integration.yml
 rename tests/{ => integration}/test_client.py (94%)

diff --git a/.github/workflows/client-test.yml b/.github/workflows/client-test.yml
index 188924814..1362636ae 100644
--- a/.github/workflows/client-test.yml
+++ b/.github/workflows/client-test.yml
@@ -62,7 +62,7 @@ jobs:
           cd tests
           djl-serving -m test::Python=file://$PWD/python &> output.log &
           sleep 15
-          python test_client.py
+          python integration/test_client.py
           jobs
           kill %1
       - name: On failure step
diff --git a/.github/workflows/llm_inf2_integration.yml b/.github/workflows/llm_inf2_integration.yml
deleted file mode 100644
index f46a893b0..000000000
--- a/.github/workflows/llm_inf2_integration.yml
+++ /dev/null
@@ -1,463 +0,0 @@
-name: Inferentia2 integration tests
-
-on:
-  workflow_dispatch:
-    inputs:
-      djl-version:
-        description: 'The released version of DJL'
-        required: false
-        default: ''
-  schedule:
-    - cron: '0 15 * * *'
-
-
-jobs:
-  create-runners:
-    runs-on: [self-hosted, scheduler]
-    steps:
-      - name: Create new Inf2.24xl instance
-        id: create_inf2
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_inf2 $token djl-serving
-      - name: Create new Inf2.24xl instance
-        id: create_inf2_2
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_inf2 $token djl-serving
-    outputs:
-      inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
-      inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }}
-
-  transformers-neuronx-container-unit-tests:
-    runs-on: [ self-hosted, inf2 ]
-    timeout-minutes: 15
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests "numpy<2" pillow wheel
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Run djl_python unit/integration tests on container
-        working-directory: engines/python/setup
-        run: |
-          # Setup
-          pip install setuptools
-          python3 -m setup bdist_wheel
-          mkdir logs
-          docker run -t --rm --network="host" \
-          --name neuron-test \
-          -v $PWD/:/opt/ml/model/ \
-          -w /opt/ml/model \
-          --device=/dev/neuron0:/dev/neuron0 \
-          deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG \
-          /bin/bash -c "'pip install /opt/ml/model/dist/*.whl pytest' && \
-          pytest djl_python/tests/neuron_test_scripts/ | tee logs/results.log"
-
-          # Cleanup
-          sudo rm -rf TinyLlama .pytest_cache djl_python
-
-          # Fail on failed tests
-          if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: engines/python/setup
-        run: |
-          cat logs/results.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: transformers-neuronx-${{ matrix.arch }}-logs
-          path: engines/python/setup/logs/
-
-  transformers-neuronx-test-1:
-    runs-on: [ self-hosted, inf2 ]
-    timeout-minutes: 90
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests "numpy<2" pillow
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-          mkdir logs
-          ./download_models.sh pytorch-inf2
-      - name: Test Pytorch model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
-          serve -m test::PyTorch:nc0=file:/opt/ml/model/resnet18_inf2_2_4.tar.gz
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Test Python mode
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
-          serve -m test::Python:nc0=file:/opt/ml/model/resnet18_no_reqs_inf2_2_4.tar.gz
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Test transformers-neuronx gpt2 with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx gpt2
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx gpt2
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx gpt2 quantization with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx gpt2-quantize
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx gpt2-quantize
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx opt-1.3b with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx opt-1.3b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx opt-1.3b
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx gpt-j-6b with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx gpt-j-6b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx gpt-j-6b
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx pythia-2.8b with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx pythia-2.8b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx pythia-2.8b
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx bloom-7b1 with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx bloom-7b1
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx bloom-7b1
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test gpt2 partition
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          python3 llm/prepare.py transformers_neuronx_aot gpt2
-          # To test the requirements.txt download.
-          echo "dummy_test" >> $PWD/models/test/requirements.txt
-
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
-          partition --model-dir /opt/ml/input/data/training/ --skip-copy | tee partition_output.log
-
-          # checking if neff files are generated.
-          sudo mv $PWD/models/test/partition-test $PWD/models/
-          if ls $PWD/models/partition-test/compiled/*.neff &>/dev/null; \
-          then echo "compiled files generated"; else exit 1; fi
-
-          # checking whether requirements.txt download is successful
-          if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
-          then echo "requirements.txt install was successful"; else exit 1; fi
-          if [ -d models ]; then sudo rm -rf models; fi
-      - name: Test gpt2-quantize partition
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          python3 llm/prepare.py transformers_neuronx_aot gpt2-quantize
-          # To test the requirements.txt download.
-          echo "dummy_test" >> $PWD/models/test/requirements.txt
-
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
-          partition --model-dir /opt/ml/input/data/training/ --skip-copy | tee partition_output.log
-
-          # checking if neff files are generated.
-          sudo mv $PWD/models/test/partition-test $PWD/models/
-          if ls $PWD/models/partition-test/compiled/*.neff &>/dev/null; \
-          then echo "compiled files generated"; else exit 1; fi
-
-          # checking whether requirements.txt download is successful
-          if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
-          then echo "requirements.txt install was successful"; else exit 1; fi
-          if [ -d models ]; then sudo rm -rf models; fi
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          if [ -d models ]; then sudo rm -rf models; fi
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: transformers-neuronx-${{ matrix.arch }}-logs
-          path: tests/integration/logs/
-
-  transformers-neuronx-test-2:
-    runs-on: [ self-hosted, inf2 ]
-    timeout-minutes: 90
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests "numpy<2" pillow
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-          mkdir logs
-          ./download_models.sh pytorch-inf2
-      - name: Test streaming transformers-neuronx opt-1.3b with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx opt-1.3b-streaming
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx opt-1.3b-streaming
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test stable diffusion 1.5 with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx stable-diffusion-1.5-neuron
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py neuron-stable-diffusion stable-diffusion-1.5-neuron
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test stable diffusion bf16 with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx stable-diffusion-2.1-neuron
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py neuron-stable-diffusion stable-diffusion-2.1-neuron
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test stable diffusion xl with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx stable-diffusion-xl-neuron
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py neuron-stable-diffusion stable-diffusion-xl-neuron
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test mistral 7B with handler
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx mistral-7b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx mistral-7b
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          if [ -d models ]; then sudo rm -rf models; fi
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: transformers-neuronx-${{ matrix.arch }}-logs
-          path: tests/integration/logs/
-
-  transformers-neuronx-rolling-batch:
-    runs-on: [ self-hosted, inf2 ]
-    timeout-minutes: 90
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up JDK 17
-        uses: actions/setup-java@v4
-        with:
-          distribution: 'corretto'
-          java-version: 17
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests "numpy<2" pillow
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-          mkdir logs
-          ./download_models.sh pytorch-inf2
-      - name: Test transformers-neuronx llama-7b rolling batch
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx llama-7b-rb
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-2 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx_rolling_batch llama-7b-rb
-          docker rm -f $(docker ps -aq)
-      - name: Test transformers-neuronx tiny-llama vllm model load and rolling batch
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx tiny-llama-rb-vllm
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx_rolling_batch tiny-llama-rb-vllm
-          docker rm -f $(docker ps -aq)
-      - name: Test transformers-neuronx llama-3-8b vllm rolling batch
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx llama-3-8b-rb-vllm
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-4 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx_rolling_batch llama-3-8b-rb-vllm
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx mixtral-8x-7b rolling batch
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx mixtral-8x7b-rb
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-4 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx_rolling_batch mixtral-8x7b-rb
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx llama-2-13b-speculative-rb rolling batch
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx llama-speculative-rb
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx_rolling_batch llama-speculative-rb
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test transformers-neuronx llama-2-13b-speculative-rb compiled draft model rolling batch
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py transformers_neuronx llama-speculative-compiled-rb
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-6 \
-          serve
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py transformers_neuronx_rolling_batch llama-speculative-compiled-rb
-          docker rm -f $(docker ps -aq)
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          if [ -d models ]; then sudo rm -rf models; fi
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: transformers-neuronx-${{ matrix.arch }}-logs
-          path: tests/integration/logs/
-
-  stop-runners:
-    if: always()
-    runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, transformers-neuronx-container-unit-tests, transformers-neuronx-test-1, transformers-neuronx-test-2, transformers-neuronx-rolling-batch ]
-    steps:
-      - name: Stop all instances
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_1 }}
-          ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_2 }}
-          ./stop_instance.sh $instance_id
diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
index a74879060..d31503595 100644
--- a/.github/workflows/llm_integration.yml
+++ b/.github/workflows/llm_integration.yml
@@ -42,29 +42,65 @@ jobs:
           --fail \
           | jq '.token' | tr -d '"' )
           ./start_instance.sh action_g6 $token djl-serving
+      - name: Create new Inf2.24xl instance
+        id: create_inf2
+        run: |
+          cd /home/ubuntu/djl_benchmark_script/scripts
+          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
+          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
+          --fail \
+          | jq '.token' | tr -d '"' )
+          ./start_instance.sh action_inf2 $token djl-serving
+      - name: Create new Inf2.24xl instance
+        id: create_inf2_2
+        run: |
+          cd /home/ubuntu/djl_benchmark_script/scripts
+          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
+          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
+          --fail \
+          | jq '.token' | tr -d '"' )
+          ./start_instance.sh action_inf2 $token djl-serving
     outputs:
       gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }}
       gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }}
       gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }}
+      inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
+      inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }}
 
   test:
-    runs-on: [ self-hosted, g6 ]
+    runs-on: [ self-hosted, "${{ matrix.test.instance }}" ]
     timeout-minutes: 60
     needs: create-runners
     strategy:
       fail-fast: false
      matrix:
         test:
-          - TestHfHandler
-          - TestTrtLlmHandler1
-          - TestTrtLlmHandler2
-          - TestSchedulerSingleGPU
-          - TestSchedulerMultiGPU
-          - TestLmiDist1
-          - TestLmiDist2
-          - TestVllm1
-          - TestVllmLora
-          - TestLmiDistLora
+          - test: TestHfHandler
+            instance: g6
+          - test: TestTrtLlmHandler1
+            instance: g6
+          - test: TestTrtLlmHandler2
+            instance: g6
+          - test: TestSchedulerSingleGPU
+            instance: g6
+          - test: TestSchedulerMultiGPU
+            instance: g6
+          - test: TestLmiDist1
+            instance: g6
+          - test: TestLmiDist2
+            instance: g6
+          - test: TestVllm1
+            instance: g6
+          - test: TestVllmLora
+            instance: g6
+          - test: TestLmiDistLora
+            instance: g6
+          - test: TestNeuronx1
+            instance: inf2
+          - test: TestNeuronx2
+            instance: inf2
+          - test: TestNeuronxRollingBatch
+            instance: inf2
     steps:
       - uses: actions/checkout@v4
       - name: Clean env
@@ -78,7 +114,7 @@ jobs:
         with:
           python-version: '3.10.x'
       - name: Install pip dependencies
-        run: pip3 install pytest requests "numpy<2" huggingface_hub
+        run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
       - name: Install awscurl
         working-directory: tests/integration
         run: |
@@ -90,7 +126,7 @@ jobs:
         env:
           TEST_DJL_VERSION: ${{ inputs.djl-version }}
         run: |
-          pytest -k ${{ matrix.test }} tests.py
+          pytest -k ${{ matrix.test.test }} tests.py
       - name: Cleanup
         working-directory: tests/integration
         run: |
@@ -108,13 +144,68 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: test-${{ matrix.test }}-logs
+          name: test-${{ matrix.test.test }}-logs
           path: tests/integration/all_logs/
 
+  transformers-neuronx-container-unit-tests:
+    runs-on: [ self-hosted, inf2 ]
+    timeout-minutes: 15
+    needs: create-runners
+    steps:
+      - uses: actions/checkout@v4
+      - name: Clean env
+        run: |
+          yes | docker system prune -a --volumes
+          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+          echo "wait dpkg lock..."
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy pillow wheel
+      - name: Build container name
+        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
+      - name: Download models and dockers
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+      - name: Run djl_python unit/integration tests on container
+        working-directory: engines/python/setup
+        run: |
+          # Setup
+          pip install setuptools
+          python3 -m setup bdist_wheel
+          mkdir logs
+          docker run -t --rm --network="host" \
+          --name neuron-test \
+          -v $PWD/:/opt/ml/model/ \
+          -w /opt/ml/model \
+          --device=/dev/neuron0:/dev/neuron0 \
+          deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG \
+          /bin/bash -c "'pip install /opt/ml/model/dist/*.whl pytest' && \
+          pytest djl_python/tests/neuron_test_scripts/ | tee logs/results.log"
+
+          # Cleanup
+          sudo rm -rf TinyLlama .pytest_cache djl_python
+
+          # Fail on failed tests
+          if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
+      - name: On fail step
+        if: ${{ failure() }}
+        working-directory: engines/python/setup
+        run: |
+          cat logs/results.log
+      - name: Upload test logs
+        uses: actions/upload-artifact@v3
+        with:
+          name: transformers-neuronx-${{ matrix.arch }}-logs
+          path: engines/python/setup/logs/
+
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, test]
+    needs: [ create-runners, test, transformers-neuronx-container-unit-tests]
     steps:
       - name: Stop all instances
         run: |
@@ -125,3 +216,7 @@ jobs:
           ./stop_instance.sh $instance_id
           instance_id=${{ needs.create-runners.outputs.gpu_instance_id_3 }}
           ./stop_instance.sh $instance_id
+          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_1 }}
+          ./stop_instance.sh $instance_id
+          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_2 }}
+          ./stop_instance.sh $instance_id
diff --git a/tests/test_client.py b/tests/integration/test_client.py
similarity index 94%
rename from tests/test_client.py
rename to tests/integration/test_client.py
index f5da2480f..ac8ebe869 100644
--- a/tests/test_client.py
+++ b/tests/integration/test_client.py
@@ -67,5 +67,10 @@ def test_image(self):
         self.assertEqual(res.content, img.tobytes())
 
 
+def run():
+    suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestInputOutput)
+    unittest.TextTestRunner().run(suite)
+
+
 if __name__ == '__main__':
-    unittest.main(verbosity=2)
+    run()
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 93ec1c450..c8efa5c93 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -2,16 +2,18 @@
 import os
 import subprocess
 
+import pytest
 import llm.prepare as prepare
 import llm.client as client
 import rb_client as rb_client
+import test_client
 
 djl_version = os.environ.get('TEST_DJL_VERSION', '').strip()
 
 
 class Runner:
 
-    def __init__(self, container, test_name=None):
+    def __init__(self, container, test_name=None, download=False):
         self.container = container
         self.test_name = test_name
@@ -26,9 +28,13 @@ def __init__(self, container, test_name=None):
 
         self.image = f"deepjavalibrary/djl-serving:{flavor}"
 
-    def __enter__(self):
         # os.system(f'docker pull {self.image}')
         os.system('rm -rf models')
+
+        if download:
+            os.system(f"./download_models.sh {self.container}")
+
+    def __enter__(self):
         return self
 
     def __exit__(self, *args):
@@ -39,7 +45,7 @@ def __exit__(self, *args):
             subprocess.run(["./remove_container.sh"], check=True)
             os.system("cat logs/serving.log")
 
-    def launch(self, env_vars=None, cmd=None):
+    def launch(self, env_vars=None, container=None, cmd=None):
         if env_vars is not None:
             with open("docker_env", "w") as f:
                 f.write(env_vars)
@@ -47,14 +53,18 @@ def launch(self, env_vars=None, cmd=None):
             if os.path.isfile("docker_env"):
                 os.remove("docker_env")
 
+        if container is None:
+            container = self.container
+
         if cmd is None:
             cmd = 'serve -m test=file:/opt/ml/model/test/'
 
         model_dir = os.path.join(os.getcwd(), 'models')
-        subprocess.run(
-            f'./launch_container.sh {self.image} {model_dir} {self.container} {cmd}'
+        return subprocess.run(
+            f'./launch_container.sh {self.image} {model_dir} {container} {cmd}'
             .split(),
-            check=True)
+            check=True,
+            capture_output=True)
 
 
 class TestHfHandler:
@@ -430,3 +440,173 @@ def test_lora_llama3_8b(self):
             prepare.build_lmi_dist_model("llama3-8b-unmerged-lora")
             r.launch()
             client.run("lmi_dist_adapters llama3-8b-unmerged-lora".split())
+
+
+class TestNeuronx1:
+    # Runs on inf2.24xl
+
+    def test_python_mode(self):
+        with Runner('pytorch-inf2', 'test_python_mode', download=True) as r:
+            r.launch(
+                cmd=
+                'serve -m test::PyTorch:nc0=file:/opt/ml/model/resnet18_inf2_2_4.tar.gz',
+                container='pytorch-inf2-1')
+            test_client.run()
+
+    def test_gpt2(self):
+        with Runner('pytorch-inf2', 'gpt2') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt2")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx gpt2".split())
+
+    def test_gpt2_quantize(self):
+        with Runner('pytorch-inf2', 'gpt2-quantize') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt2-quantize")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx gpt2-quantize".split())
+
+    def test_opt_1_3b(self):
+        with Runner('pytorch-inf2', 'opt-1.3b') as r:
+            prepare.build_transformers_neuronx_handler_model("opt-1.3b")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx opt-1.3b".split())
+
+    def test_gpt_j_6b(self):
+        with Runner('pytorch-inf2', 'gpt-j-6b') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt-j-6b")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx gpt-j-6b".split())
+
+    def test_pythia(self):
+        with Runner('pytorch-inf2', 'pythia-2.8b') as r:
+            prepare.build_transformers_neuronx_handler_model("pythia-2.8b")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx pythia-2.8b".split())
+
+    def test_bloom(self):
+        with Runner('pytorch-inf2', 'bloom-7b1') as r:
+            prepare.build_transformers_neuronx_handler_model("bloom-7b1")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx bloom-7b1".split())
+
+    @pytest.mark.parametrize("model", ["gpt2", "gpt2-quantize"])
+    def test_partition(self, model):
+        try:
+            with Runner('pytorch-inf2', f'partition-{model}') as r:
+                prepare.build_transformers_neuronx_handler_model(model)
+                with open("models/test/requirements.txt", "a") as f:
+                    f.write("dummy_test")
+                partition_output = r.launch(
+                    container="pytorch-inf2-1",
+                    cmd=
+                    'partition --model-dir /opt/ml/input/data/training/ --save-mp-checkpoint-path /opt/ml/input/data/training/partition --skip-copy'
+                )
+
+                # Check if neff files are generated
+                if len([
+                        fn
+                        for fn in os.listdir("models/test/partition/compiled")
+                        if fn.endswith(".neff")
+                ]) == 0:
+                    raise Exception("Failed to generate any .neff files")
+
+                # Check whether requirements.txt download is sufficient
+                if 'pip install requirements succeed!' not in partition_output.stdout.decode(
+                        "utf-8"):
+                    raise Exception(
+                        "Requirements.txt not installed successfully")
+        finally:
+            os.system('sudo rm -rf models')
+
+
+class TestNeuronx2:
+    # Runs on inf2.24xl
+
+    def test_stream_opt(self):
+        with Runner('pytorch-inf2', 'opt-1.3b-streaming') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "opt-1.3b-streaming")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx opt-1.3b-streaming".split())
+
+    def test_mistral(self):
+        with Runner('pytorch-inf2', 'mistral-7b') as r:
+            prepare.build_transformers_neuronx_handler_model("mistral-7b")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx mistral-7b".split())
+
+    def test_stable_diffusion_1_5(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-1.5-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "stable-diffusion-1.5-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run(
+                "neuron-stable-diffusion stable-diffusion-1.5-neuron".split())
+
+    def test_stable_diffusion_2_1(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-2.1-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "stable-diffusion-2.1-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run(
+                "neuron-stable-diffusion stable-diffusion-2.1-neuron".split())
+
+    def test_stable_diffusion_xl(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-xl-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "stable-diffusion-xl-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run(
+                "neuron-stable-diffusion stable-diffusion-xl-neuron".split())
+
+
+class TestNeuronxRollingBatch:
+    # Runs on inf2.24xl
+
+    def test_llama_7b(self):
+        with Runner('pytorch-inf2', 'llama-7b-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-7b-rb")
+            r.launch(container='pytorch-inf2-2')
+            client.run(
+                "transformers_neuronx_rolling_batch llama-7b-rb".split())
+
+    def test_tiny_llama_vllm(self):
+        with Runner('pytorch-inf2', 'tiny-llama-rb-vllm') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "tiny-llama-rb-vllm")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx_rolling_batch tiny-llama-rb-vllm".
+                       split())
+
+    def test_llama3_vllm(self):
+        with Runner('pytorch-inf2', 'llama-3-8b-rb-vllm') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "llama-3-8b-rb-vllm")
+            r.launch(container='pytorch-inf2-4')
+            client.run("transformers_neuronx_rolling_batch llama-3-8b-rb-vllm".
+                       split())
+
+    def test_mixtral(self):
+        with Runner('pytorch-inf2', 'mixtral-8x7b-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("mixtral-8x7b-rb")
+            r.launch(container='pytorch-inf2-4')
+            client.run(
+                "transformers_neuronx_rolling_batch mixtral-8x7b-rb".split())
+
+    def test_llama_speculative(self):
+        with Runner('pytorch-inf2', 'llama-speculative-rb') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "llama-speculative-rb")
+            r.launch(container='pytorch-inf2-6')
+            client.run(
+                "transformers_neuronx_rolling_batch llama-speculative-rb".
+                split())
+
+    def test_llama_speculative_compiled(self):
+        with Runner('pytorch-inf2', 'llama-speculative-compiled-rb') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "llama-speculative-compiled-rb")
+            r.launch(container='pytorch-inf2-6')
+            client.run(
+                "transformers_neuronx_rolling_batch llama-speculative-compiled-rb"
+                .split())

From dda02c228bc54e210d091d3cb09ea6b499b46718 Mon Sep 17 00:00:00 2001
From: Zach Kimberg
Date: Tue, 25 Jun 2024 15:41:42 -0700
Subject: [PATCH 2/2] Fix

---
 .github/workflows/client-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/client-test.yml b/.github/workflows/client-test.yml
index 1362636ae..3502fd244 100644
--- a/.github/workflows/client-test.yml
+++ b/.github/workflows/client-test.yml
@@ -109,7 +109,7 @@ jobs:
           ./gradlew --stop
           ./gradlew :serving:run --args="-m test::Python=file:$(pwd -W)/tests/python" &> output.log &
           sleep 30
-          cd tests/ && python test_client.py
+          cd tests/ && python integration/test_client.py
       - name: On failure step
         if: ${{ failure() }}
         shell: bash