[tnx] refactor neuron testing

deepjavalibrary · May 24, 2024 · 921b71d · 921b71d
1 parent 030a745
commit 921b71d
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 8 deletions.
diff --git a/.github/workflows/llm_inf2_integration.yml b/.github/workflows/llm_inf2_integration.yml
@@ -37,6 +37,53 @@ jobs:
       inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
       inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }}
 
+  # Runs the djl_python neuron test scripts with pytest inside the pytorch-inf2
+  # djl-serving container on a self-hosted inf2 runner.
+  transformers-neuronx-container-unit-tests:
+    runs-on: [ self-hosted, inf2 ]
+    timeout-minutes: 15
+    needs: create-runners
+    steps:
+      - uses: actions/checkout@v4
+      - name: Clean env
+        run: |
+          yes | docker system prune -a --volumes
+          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+          echo "wait dpkg lock..."
+          # Poll every 5 seconds until no process holds the dpkg/apt lock files.
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy pillow
+      # NOTE(review): presumably this script exports DJLSERVING_DOCKER_TAG for
+      # the following steps - confirm against docker_name_builder.sh.
+      - name: Build container name
+        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
+      - name: Download models and dockers
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+      - name: Run djl_python unit/integration tests on container
+        run: |
+          # -p: do not fail if a previous run left the directory behind.
+          mkdir -p logs
+          # Capture the container's exit status instead of aborting the step,
+          # so the cleanup below always runs. pipefail makes a pytest failure
+          # visible through the `| tee` pipe.
+          test_status=0
+          docker run -t --rm --network="host" \
+          --name neuron-test \
+          -v $PWD/:/opt/ml/model/ \
+          -w /opt/ml/model \
+          --device=/dev/neuron0:/dev/neuron0 \
+          deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG \
+          /bin/bash -c "set -o pipefail && \
+          pip install /opt/ml/model/engines/python/setup/. pytest && \
+          pytest engines/python/setup/djl_python/tests/neuron_test_scripts/ | tee logs/results.log" \
+          || test_status=$?
+
+          sudo rm -rf TinyLlama
+          # Fail on a non-zero exit (pip install error, pytest failure/error) ...
+          if [ "$test_status" -ne 0 ]; then exit 1; fi
+          # ... and keep the original textual check of the pytest output.
+          if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
+      - name: On fail step
+        if: ${{ failure() }}
+        run: |
+          cat logs/results.log
+      # always(): upload the logs even when the test step above failed.
+      - name: Upload test logs
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          # This job defines no matrix, so a fixed artifact name is used.
+          name: transformers-neuronx-container-unit-tests-logs
+          path: logs/
+
   transformers-neuronx-test-1:
     runs-on: [ self-hosted, inf2 ]
     timeout-minutes: 90
@@ -162,6 +209,7 @@ jobs:
           # checking whether requirements.txt download is successful
           if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
           then echo "requirements.txt install was successful"; else exit 1; fi
+          if [ -d models ]; then sudo rm -rf models; fi
       - name: Test gpt2-quantize partition
         working-directory: tests/integration
         run: |
@@ -181,10 +229,12 @@ jobs:
           # checking whether requirements.txt download is successful
           if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
           then echo "requirements.txt install was successful"; else exit 1; fi
+          if [ -d models ]; then sudo rm -rf models; fi
       - name: On fail step
         if: ${{ failure() }}
         working-directory: tests/integration
         run: |
+          if [ -d models ]; then sudo rm -rf models; fi
           cat logs/serving.log
       - name: Upload test logs
         uses: actions/upload-artifact@v3
@@ -277,6 +327,7 @@ jobs:
         if: ${{ failure() }}
         working-directory: tests/integration
         run: |
+          if [ -d models ]; then sudo rm -rf models; fi
           cat logs/serving.log
       - name: Upload test logs
         uses: actions/upload-artifact@v3
@@ -382,6 +433,7 @@ jobs:
         if: ${{ failure() }}
         working-directory: tests/integration
         run: |
+          if [ -d models ]; then sudo rm -rf models; fi
           cat logs/serving.log
       - name: Upload test logs
         uses: actions/upload-artifact@v3
@@ -392,7 +444,7 @@ jobs:
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, transformers-neuronx-test-1, transformers-neuronx-test-2, transformers-neuronx-rolling-batch ]
+    needs: [ create-runners, transformers-neuronx-container-unit-tests, transformers-neuronx-test-1, transformers-neuronx-test-2, transformers-neuronx-rolling-batch ]
     steps:
       - name: Stop all instances
         run: |

diff --git a/engines/python/setup/djl_python/properties_manager/tnx_properties.py b/engines/python/setup/djl_python/properties_manager/tnx_properties.py
@@ -150,8 +150,8 @@ def validate_rolling_batch(cls, rolling_batch: str) -> str:
         if rolling_batch not in TNX_SUPPORTED_ROLLING_BATCH_TYPES:
             logging.warning(
                 f"transformer neuronx only supports "
-                f"rolling batch type {TNX_SUPPORTED_ROLLING_BATCH_TYPES}."
-                f"choosing neuronx rolling batch automatically.")
+                f"rolling batch type {TNX_SUPPORTED_ROLLING_BATCH_TYPES} "
+                f"choosing 'tnx' rolling batch automatically.")
             return 'auto'
         return rolling_batch
 

diff --git a/engines/python/setup/djl_python/tests/neuron_test_scripts/test_neuron_vllm_rolling_batch.py b/engines/python/setup/djl_python/tests/neuron_test_scripts/test_neuron_vllm_rolling_batch.py
@@ -21,7 +21,6 @@
 try:
     from djl_python.properties_manager.tnx_properties import TransformerNeuronXProperties
     from djl_python.rolling_batch.vllm_rolling_batch import VLLMRollingBatch
-    from djl_python.tests.rolling_batch_test_scripts.generator import Generator
     SKIP_TEST = False
 except ImportError:
     SKIP_TEST = True
@@ -46,10 +45,11 @@ class TestNeuronVLLM(unittest.TestCase):
     def test_models(self):
         # === Preparation ===
         script_directory = os.path.dirname(os.path.abspath(__file__))
-        relative_path = "../../../"
+        relative_path = "../rolling_batch_test_scripts"
         new_path = os.path.normpath(
             os.path.join(script_directory, relative_path))
         sys.path.append(new_path)
+        from djl_python.tests.rolling_batch_test_scripts.generator import Generator
 
         # --- Models ---
         model_names = [
@@ -148,5 +148,4 @@ def test_models(self):
 
 
 if __name__ == '__main__':
-    c = TestNeuronVLLM()
-    c.test_models()
+    unittest.main()
diff --git a/engines/python/setup/djl_python/tests/neuron_test_scripts/test_transformers_neuronx.py b/engines/python/setup/djl_python/tests/neuron_test_scripts/test_transformers_neuronx.py
@@ -152,7 +152,7 @@ def test_set_rolling_batch(self, params):
 
         # Test
         with self.patch_neuron_rolling_batch() as mock_rolling_batch:
-            self.service.set_rolling_batch()
+            self.service.set_rolling_batch(test_properties)
 
         # Evaluate
         self.assertEqual(mock_rolling_batch.called, expected)