From 25a8e05f3d0713bf65a596cfdf6182e29e44f7a7 Mon Sep 17 00:00:00 2001 From: Zach Kimberg Date: Wed, 12 Jun 2024 11:54:37 -0700 Subject: [PATCH] [CI] Inferentia tests through pytest --- .github/workflows/llm_integration.yml | 169 ++++++++++++++++++++------ tests/integration/tests.py | 158 +++++++++++++++++++++++- tests/test_client.py | 6 +- 3 files changed, 290 insertions(+), 43 deletions(-) diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml index 02783f2393..705df07e80 100644 --- a/.github/workflows/llm_integration.yml +++ b/.github/workflows/llm_integration.yml @@ -15,56 +15,92 @@ jobs: create-runners: runs-on: [self-hosted, scheduler] steps: - - name: Create new G6 instance - id: create_gpu +# - name: Create new G6 instance +# id: create_gpu +# run: | +# cd /home/ubuntu/djl_benchmark_script/scripts +# token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ +# https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ +# --fail \ +# | jq '.token' | tr -d '"' ) +# ./start_instance.sh action_g6 $token djl-serving +# - name: Create new G6 instance +# id: create_gpu2 +# run: | +# cd /home/ubuntu/djl_benchmark_script/scripts +# token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ +# https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ +# --fail \ +# | jq '.token' | tr -d '"' ) +# ./start_instance.sh action_g6 $token djl-serving +# - name: Create new G6 instance +# id: create_gpu3 +# run: | +# cd /home/ubuntu/djl_benchmark_script/scripts +# token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ +# https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ +# --fail \ +# | jq '.token' | tr -d '"' ) +# ./start_instance.sh action_g6 $token djl-serving + - name: Create new Inf2.24xl instance + id: create_inf2 run: | cd /home/ubuntu/djl_benchmark_script/scripts token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ --fail \ | jq '.token' | tr -d '"' ) - ./start_instance.sh action_g6 $token djl-serving - - name: Create new G6 instance - id: create_gpu2 + ./start_instance.sh action_inf2 $token djl-serving + - name: Create new Inf2.24xl instance + id: create_inf2_2 run: | cd /home/ubuntu/djl_benchmark_script/scripts token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ --fail \ | jq '.token' | tr -d '"' ) - ./start_instance.sh action_g6 $token djl-serving - - name: Create new G6 instance - id: create_gpu3 - run: | - cd /home/ubuntu/djl_benchmark_script/scripts - token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ - https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ - --fail \ - | jq '.token' | tr -d '"' ) - ./start_instance.sh action_g6 $token djl-serving + ./start_instance.sh action_inf2 $token djl-serving outputs: - gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }} - gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }} - gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }} +# gpu_instance_id_1: ${{ 
steps.create_gpu.outputs.action_g6_instance_id }} +# gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }} +# gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }} + inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }} + inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }} test: - runs-on: [ self-hosted, g6 ] + runs-on: [ self-hosted, ${{ matrix.test.instance}} ] timeout-minutes: 60 needs: create-runners strategy: fail-fast: false matrix: test: - - TestHfHandler - - TestTrtLlmHandler1 - - TestTrtLlmHandler2 - - TestSchedulerSingleGPU - - TestSchedulerMultiGPU - - TestLmiDist1 - - TestLmiDist2 - - TestVllm1 - - TestVllmLora - - TestLmiDistLora +# - test: TestHfHandler +# instance: g6 +# - test: TestTrtLlmHandler1 +# instance: g6 +# - test: TestTrtLlmHandler2 +# instance: g6 +# - test: TestSchedulerSingleGPU +# instance: g6 +# - test: TestSchedulerMultiGPU +# instance: g6 +# - test: TestLmiDist1 +# instance: g6 +# - test: TestLmiDist2 +# instance: g6 +# - test: TestVllm1 +# instance: g6 +# - test: TestVllmLora +# instance: g6 +# - test: TestLmiDistLora +# instance: g6 + - test: TestNeuronx1 + instance: inf2 + - test: TestNeuronx2 + instance: inf2 + - test: TestNeuronxRollingBatch + instance: inf2 steps: - uses: actions/checkout@v4 - name: Clean env @@ -78,7 +114,7 @@ jobs: with: python-version: '3.10.x' - name: Install pip dependencies - run: pip3 install pytest requests numpy huggingface_hub + run: pip3 install pytest requests numpy pillow huggingface_hub - name: Install awscurl working-directory: tests/integration run: | @@ -90,7 +126,7 @@ jobs: env: TEST_DJL_VERSION: ${{ inputs.djl-version }} run: | - pytest -k ${{ matrix.test }} tests.py + pytest -k ${{ matrix.test.test }} tests.py - name: Cleanup working-directory: tests/integration run: | @@ -108,20 +144,79 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: test-${{ matrix.test }}-logs + name: test-${{ matrix.test.test }}-logs path: tests/integration/all_logs/ + transformers-neuronx-container-unit-tests: + runs-on: [ self-hosted, inf2 ] + timeout-minutes: 15 + needs: create-runners + steps: + - uses: actions/checkout@v4 + - name: Clean env + run: | + yes | docker system prune -a --volumes + sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ + echo "wait dpkg lock..." 
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy pillow wheel
+      - name: Build container name
+        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
+      - name: Download models and dockers
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+      - name: Run djl_python unit/integration tests on container
+        working-directory: engines/python/setup
+        run: |
+          # Setup
+          pip install setuptools
+          python3 -m setup bdist_wheel
+          mkdir logs
+          docker run -t --rm --network="host" \
+          --name neuron-test \
+          -v $PWD/:/opt/ml/model/ \
+          -w /opt/ml/model \
+          --device=/dev/neuron0:/dev/neuron0 \
+          deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG \
+          /bin/bash -c "pip install /opt/ml/model/dist/*.whl pytest && \
+          pytest djl_python/tests/neuron_test_scripts/ | tee logs/results.log"
+
+          # Cleanup
+          sudo rm -rf TinyLlama .pytest_cache djl_python
+
+          # Fail on failed tests
+          if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
+      - name: On fail step
+        if: ${{ failure() }}
+        working-directory: engines/python/setup
+        run: |
+          cat logs/results.log
+      - name: Upload test logs
+        uses: actions/upload-artifact@v3
+        with:
+          name: transformers-neuronx-logs
+          path: engines/python/setup/logs/
+
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, test]
+    needs: [ create-runners, test, transformers-neuronx-container-unit-tests]
     steps:
       - name: Stop all instances
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
-          ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
+          # instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
+          # ./stop_instance.sh $instance_id
+          # instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
+          # ./stop_instance.sh $instance_id
+          # instance_id=${{ needs.create-runners.outputs.gpu_instance_id_3 }}
+          # ./stop_instance.sh $instance_id
+          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_1 }}
           ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_3 }}
+          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_2 }}
           ./stop_instance.sh $instance_id
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 089de8d969..e83afc8243 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -2,16 +2,18 @@
 import os
 import subprocess
 
+import pytest
 import llm.prepare as prepare
 import llm.client as client
 import rb_client as rb_client
+import test_client
 
 djl_version = os.environ.get('TEST_DJL_VERSION', '').strip()
 
 
 class Runner:
 
-    def __init__(self, container, test_name=None):
+    def __init__(self, container, test_name=None, download=False):
         self.container = container
         self.test_name = test_name
 
@@ -26,6 +28,10 @@ def __init__(self, container, test_name=None):
 
         self.image = f"deepjavalibrary/djl-serving:{flavor}"
 
+        if download:
+            if not os.path.exists("models"):
+                os.system(f"./download_models.sh {self.container}")
+
     def __enter__(self):
         # os.system(f'docker pull {self.image}')
         os.system('rm -rf models')
@@ -39,16 +45,22 @@ def __exit__(self, *args):
         subprocess.run(["./remove_container.sh"], check=True)
         os.system("cat logs/serving.log")
 
-    def launch(self, env_vars=None):
+    def launch(self, env_vars=None, container=None, cmd=None):
         if env_vars is not None:
             with open("docker_env", "w") as f:
                 f.write(env_vars)
 
+        if container is None:
+            container = self.container
+
+        if cmd is None:
+            cmd = 'serve -m test=file:/opt/ml/model/test/'
+
         model_dir = os.path.join(os.getcwd(), 'models')
-        subprocess.run(
-            f'./launch_container.sh {self.image} {model_dir} {self.container} serve -m test=file:/opt/ml/model/test/'
+        return subprocess.run(
+            f'./launch_container.sh {self.image} {model_dir} {container} {cmd}'
             .split(),
-            check=True)
+            check=True, capture_output=True)
 
 
 class TestHfHandler:
@@ -424,3 +436,139 @@ def test_lora_llama3_8b(self):
             prepare.build_lmi_dist_model("llama3-8b-unmerged-lora")
             r.launch()
             client.run("lmi_dist_adapters llama3-8b-unmerged-lora".split())
+
+
+class TestNeuronx1:
+    # Runs on inf2.24xl
+
+    def test_python_mode(self):
+        with Runner('pytorch-inf2', 'test_python_mode', download=True) as r:
+            r.launch(cmd='serve -m test::PyTorch:nc0=file:/opt/ml/model/resnet18_inf2_2_4.tar.gz', container='pytorch-inf2-1')
+            test_client.run()
+
+    def test_gpt2(self):
+        with Runner('pytorch-inf2', 'gpt2') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt2")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx gpt2".split())
+
+    def test_gpt2_quantize(self):
+        with Runner('pytorch-inf2', 'gpt2-quantize') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt2-quantize")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx gpt2-quantize".split())
+
+    def test_opt_1_3b(self):
+        with Runner('pytorch-inf2', 'opt-1.3b') as r:
+            prepare.build_transformers_neuronx_handler_model("opt-1.3b")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx opt-1.3b".split())
+
+    def test_gpt_j_6b(self):
+        with Runner('pytorch-inf2', 'gpt-j-6b') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt-j-6b")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx gpt-j-6b".split())
+
+    def test_pythia(self):
+        with Runner('pytorch-inf2', 'pythia-2.8b') as r:
+            prepare.build_transformers_neuronx_handler_model("pythia-2.8b")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx pythia-2.8b".split())
+
+    def test_bloom(self):
+        with Runner('pytorch-inf2', 'bloom-7b1') as r:
+            prepare.build_transformers_neuronx_handler_model("bloom-7b1")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx bloom-7b1".split())
+
+    @pytest.mark.parametrize("model", ["gpt2", "gpt2-quantize"])
+    def test_partition(self, model):
+        with Runner('pytorch-inf2', f'partition-{model}') as r:
+            prepare.build_transformers_neuronx_handler_model(model)
+            with open("models/test/requirements.txt", "a") as f:
+                f.write("dummy_test")
+            partition_output = r.launch(cmd='partition --model-dir /opt/ml/input/data/training/ --skip-copy')
+
+            # Check that .neff files were generated
+            if len([fn for fn in os.listdir("models/test/partition-test/compiled") if fn.endswith(".neff")]) == 0:
+                raise Exception("Failed to generate any .neff files")
+
+            # Check that the requirements.txt install succeeded
+            if 'pip install requirements succeed!' not in partition_output.stdout.decode("utf-8"):
+                raise Exception("Requirements.txt not installed successfully")
+
+
+class TestNeuronx2:
+    # Runs on inf2.24xl
+
+    def test_stream_opt(self):
+        with Runner('pytorch-inf2', 'opt-1.3b-streaming') as r:
+            prepare.build_transformers_neuronx_handler_model("opt-1.3b-streaming")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx opt-1.3b-streaming".split())
+
+    def test_mistral(self):
+        with Runner('pytorch-inf2', 'mistral-7b') as r:
+            prepare.build_transformers_neuronx_handler_model("mistral-7b")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx mistral-7b".split())
+
+    def test_stable_diffusion_1_5(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-1.5-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model("stable-diffusion-1.5-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run("neuron-stable-diffusion stable-diffusion-1.5-neuron".split())
+
+    def test_stable_diffusion_2_1(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-2.1-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model("stable-diffusion-2.1-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run("neuron-stable-diffusion stable-diffusion-2.1-neuron".split())
+
+    def test_stable_diffusion_xl(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-xl-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model("stable-diffusion-xl-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run("neuron-stable-diffusion stable-diffusion-xl-neuron".split())
+
+
+class TestNeuronxRollingBatch:
+    # Runs on inf2.24xl
+
+    def test_llama_7b(self):
+        with Runner('pytorch-inf2', 'llama-7b-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-7b-rb")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx_rolling_batch llama-7b-rb".split())
+
+    def test_tiny_llama_vllm(self):
+        with Runner('pytorch-inf2', 'tiny-llama-rb-vllm') as r:
+            prepare.build_transformers_neuronx_handler_model("tiny-llama-rb-vllm")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx_rolling_batch tiny-llama-rb-vllm".split())
+
+    def test_llama3_vllm(self):
+        with Runner('pytorch-inf2', 'llama-3-8b-rb-vllm') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-3-8b-rb-vllm")
+            r.launch(container='pytorch-inf2-4')
+            client.run("transformers_neuronx_rolling_batch llama-3-8b-rb-vllm".split())
+
+    def test_mixtral(self):
+        with Runner('pytorch-inf2', 'mixtral-8x7b-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("mixtral-8x7b-rb")
+            r.launch(container='pytorch-inf2-4')
+            client.run("transformers_neuronx_rolling_batch mixtral-8x7b-rb".split())
+
+    def test_llama_speculative(self):
+        with Runner('pytorch-inf2', 'llama-speculative-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-speculative-rb")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx_rolling_batch llama-speculative-rb".split())
+
+    def test_llama_speculative_compiled(self):
+        with Runner('pytorch-inf2', 'llama-speculative-compiled-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-speculative-compiled-rb")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx_rolling_batch llama-speculative-compiled-rb".split())
+
diff --git a/tests/test_client.py b/tests/test_client.py
index f5da2480ff..70289959a0 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -67,5 +67,12 @@ def test_image(self):
         self.assertEqual(res.content, img.tobytes())
 
 
-if __name__ == '__main__':
-    unittest.main(verbosity=2)
+def run():
+    # pin module/argv so this also works when imported by the pytest suite,
+    # and surface failures to the caller instead of exiting the process
+    prog = unittest.main(module=__name__, argv=['test_client'], exit=False, verbosity=2)
+    assert prog.result.wasSuccessful()
+
+
+if __name__ == '__main__':
+    run()