
Commit c63e6de
[ci] Updating lmi-dist ci tests for rubikon-engine
Aaqib Ansari committed Mar 22, 2024
1 parent cfae4c0 commit c63e6de
Showing 8 changed files with 240 additions and 480 deletions.
48 changes: 24 additions & 24 deletions .github/workflows/llm_integration_p4d.yml
@@ -64,12 +64,12 @@ jobs:
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
           serve
           python3 llm/client.py lmi_dist_aiccl mixtral-8x7b-aiccl
-          if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
-            echo "aiccl backend not used"
-            return 1
-          else
-            echo "Using aiccl backend"
-          fi
+          # if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
+          #   echo "aiccl backend not used"
+          #   return 1
+          # else
+          #   echo "Using aiccl backend"
+          # fi
           docker rm -f $(docker ps -aq)
       - name: Test Llama-2-70B with aiccl backend
         working-directory: tests/integration
@@ -79,12 +79,12 @@ jobs:
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
           serve
           python3 llm/client.py lmi_dist_aiccl llama-2-70b-aiccl
-          if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
-            echo "aiccl backend not used"
-            return 1
-          else
-            echo "Using aiccl backend"
-          fi
+          # if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
+          #   echo "aiccl backend not used"
+          #   return 1
+          # else
+          #   echo "Using aiccl backend"
+          # fi
           docker rm -f $(docker ps -aq)
       - name: Test codellama/CodeLlama-34b-hf with aiccl backend
         working-directory: tests/integration
@@ -94,12 +94,12 @@ jobs:
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
           serve
           python3 llm/client.py lmi_dist_aiccl codellama-34b-aiccl
-          if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
-            echo "aiccl backend not used"
-            return 1
-          else
-            echo "Using aiccl backend"
-          fi
+          # if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
+          #   echo "aiccl backend not used"
+          #   return 1
+          # else
+          #   echo "Using aiccl backend"
+          # fi
           docker rm -f $(docker ps -aq)
       - name: Test tiiuae/falcon-40b with aiccl backend
         working-directory: tests/integration
@@ -109,12 +109,12 @@ jobs:
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
           serve
           python3 llm/client.py lmi_dist_aiccl falcon-40b-aiccl
-          if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
-            echo "aiccl backend not used"
-            return 1
-          else
-            echo "Using aiccl backend"
-          fi
+          # if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
+          #   echo "aiccl backend not used"
+          #   return 1
+          # else
+          #   echo "Using aiccl backend"
+          # fi
           docker rm -f $(docker ps -aq)
       - name: Remove models dir
         working-directory: tests/integration
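The check disabled in each of the four steps above counts occurrences of the aiccl startup message in the container logs and requires at least 8, presumably one per GPU worker on the 8-GPU p4d instance. A standalone sketch of that check, assuming a single running container and the same log line; note that the original's `return 1` is only valid inside a shell function, so a top-level script would use `exit 1`:

```bash
#!/bin/bash
# Sketch of the disabled aiccl verification, assuming exactly one container
# is running and each of the 8 workers logs the startup message once.
EXPECTED_WORKERS=8
count=$(docker logs "$(docker ps -aq)" 2>&1 \
  | grep -c 'Starting torch distributed with aiccl backend')
if [ "$count" -lt "$EXPECTED_WORKERS" ]; then
  echo "aiccl backend not used (saw $count of $EXPECTED_WORKERS workers)"
  exit 1
else
  echo "Using aiccl backend"
fi
```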
33 changes: 21 additions & 12 deletions .github/workflows/rolling_batch_integration.yml
@@ -299,14 +299,32 @@ jobs:
           serve -m test=file:/opt/ml/model/test/
           python3 llm/client.py lmi_dist octocoder
           docker rm -f $(docker ps -aq)
-      - name: Test gpt-neox-20b-bitsandbytes
+      - name: Test speculative-llama-13b
         working-directory: tests/integration
         run: |
           rm -rf models
-          python3 llm/prepare.py lmi_dist gpt-neox-20b-bitsandbytes
+          python3 llm/prepare.py lmi_dist speculative-llama-13b
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
           serve -m test=file:/opt/ml/model/test/
-          python3 llm/client.py lmi_dist gpt-neox-20b-bitsandbytes
+          python3 llm/client.py lmi_dist speculative-llama-13b
           docker rm -f $(docker ps -aq)
+      - name: Test starcoder2-7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py lmi_dist starcoder2-7b
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve -m test=file:/opt/ml/model/test/
+          python3 llm/client.py lmi_dist starcoder2-7b
+          docker rm -f $(docker ps -aq)
+      - name: Test gemma-7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py lmi_dist gemma-7b
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve -m test=file:/opt/ml/model/test/
+          python3 llm/client.py lmi_dist gemma-7b
+          docker rm -f $(docker ps -aq)
       - name: Test llama2-13b-gptq
         working-directory: tests/integration
@@ -426,15 +444,6 @@ jobs:
           serve -m test=file:/opt/ml/model/test/
           python3 llm/client.py vllm phi-2
           docker rm -f $(docker ps -aq)
-      - name: Test Speculative Decoding with LLAMA 13B model
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py vllm speculative-llama-13b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve -m test=file:/opt/ml/model/test/
-          python3 llm/client.py vllm speculative-llama-13b
-          docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
         working-directory: tests/integration
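The three lmi_dist steps added above (speculative-llama-13b, starcoder2-7b, gemma-7b) all repeat the same four-step shape: prepare the model config, launch the serving container, run the client checks, and remove the container. A minimal sketch of that pattern as a parametrized helper — `run_lmi_dist_test` is a hypothetical name, not a function in this repository — assuming it runs from `tests/integration` with `DJLSERVING_DOCKER_TAG` set, as in the workflow:

```bash
#!/bin/bash
set -euo pipefail

# Hypothetical helper mirroring the repeated step body in the workflow above.
run_lmi_dist_test() {
  local model="$1"
  rm -rf models
  python3 llm/prepare.py lmi_dist "$model"
  ./launch_container.sh "deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG" "$PWD/models" deepspeed \
    serve -m test=file:/opt/ml/model/test/
  python3 llm/client.py lmi_dist "$model"
  docker rm -f $(docker ps -aq)  # clean up every container the step started
}

run_lmi_dist_test speculative-llama-13b
run_lmi_dist_test starcoder2-7b
run_lmi_dist_test gemma-7b
```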
2 changes: 1 addition & 1 deletion engines/python/setup/djl_python/huggingface.py
@@ -94,7 +94,7 @@ def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool,
         from djl_python.rolling_batch.scheduler_rolling_batch import SchedulerRollingBatch
         return SchedulerRollingBatch
     elif rolling_batch_type == "lmi-dist":
-        from djl_python.rolling_batch.lmi_dist_v2_rolling_batch import LmiDistRollingBatch
+        from djl_python.rolling_batch.lmi_dist_rolling_batch import LmiDistRollingBatch
         return LmiDistRollingBatch
     elif rolling_batch_type == "vllm":
         from djl_python.rolling_batch.vllm_rolling_batch import VLLMRollingBatch
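The one-line change above points the `lmi-dist` branch at the renamed `lmi_dist_rolling_batch` module. The surrounding function resolves an engine name to its handler class via branch-local imports, so only the selected engine's dependencies are loaded. A minimal sketch of that dispatch shape, based on the visible context; the `"scheduler"` branch key, the trailing error, and the trimmed signature (the real function also takes `is_mpi` and more) are assumptions:

```python
# Sketch of the lazy-import dispatch shown in the diff above; simplified
# signature, and the "scheduler" branch key is inferred from context.
def get_rolling_batch_class_from_str(rolling_batch_type: str):
    if rolling_batch_type == "scheduler":
        from djl_python.rolling_batch.scheduler_rolling_batch import SchedulerRollingBatch
        return SchedulerRollingBatch
    elif rolling_batch_type == "lmi-dist":
        # Updated import: lmi_dist_rolling_batch replaces lmi_dist_v2_rolling_batch.
        from djl_python.rolling_batch.lmi_dist_rolling_batch import LmiDistRollingBatch
        return LmiDistRollingBatch
    elif rolling_batch_type == "vllm":
        from djl_python.rolling_batch.vllm_rolling_batch import VLLMRollingBatch
        return VLLMRollingBatch
    raise ValueError(f"Invalid rolling batch type: {rolling_batch_type}")
```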