Skip to content

Commit

Permalink
[CI] restart docker service if general container kill doesn't work (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
Qing Lan authored May 3, 2024
1 parent ca18e9d commit 325d573
Showing 1 changed file with 14 additions and 34 deletions.
48 changes: 14 additions & 34 deletions .github/workflows/llm_integration_p4d.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
outputs:
p4d_instance_id: ${{ steps.create_gpu_p4d.outputs.action_lmic_p4d_instance_id }}

lmi-dist-aiccl-test:
lmi-dist-test:
if: contains(fromJson('["", "aiccl"]'), github.event.inputs.run_test)
runs-on: [ self-hosted, p4d ]
timeout-minutes: 120
Expand All @@ -56,62 +56,42 @@ jobs:
working-directory: tests/integration
run: |
docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
- name: Test Mixtral-8x7B with aiccl backend
- name: Test Mixtral-8x7B
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl mixtral-8x7b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl mixtral-8x7b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
else
echo "Using aiccl backend"
fi
docker rm -f $(docker ps -aq)
- name: Test Llama-2-70B with aiccl backend
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Test Llama-2-70B
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl llama-2-70b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl llama-2-70b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
else
echo "Using aiccl backend"
fi
docker rm -f $(docker ps -aq)
- name: Test codellama/CodeLlama-34b-hf with aiccl backend
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Test codellama/CodeLlama-34b-hf
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl codellama-34b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl codellama-34b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
else
echo "Using aiccl backend"
fi
docker rm -f $(docker ps -aq)
- name: Test tiiuae/falcon-40b with aiccl backend
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Test tiiuae/falcon-40b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl falcon-40b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl falcon-40b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
else
echo "Using aiccl backend"
fi
docker rm -f $(docker ps -aq)
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Remove models dir
working-directory: tests/integration
run: |
Expand Down Expand Up @@ -161,7 +141,7 @@ jobs:
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
serve
python3 llm/client.py trtllm llama2-70b
docker rm -f $(docker ps -aq)
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Test mixtral-8x7b with TP8
working-directory: tests/integration
run: |
Expand All @@ -170,7 +150,7 @@ jobs:
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
serve
python3 llm/client.py trtllm mixtral-8x7b
docker rm -f $(docker ps -aq)
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Remove models dir
working-directory: tests/integration
run: |
Expand Down Expand Up @@ -220,7 +200,7 @@ jobs:
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py vllm llama2-70b
docker rm -f $(docker ps -aq)
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Test mixtral-8x7b with TP8
working-directory: tests/integration
run: |
Expand All @@ -229,7 +209,7 @@ jobs:
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py vllm mixtral-8x7b
docker rm -f $(docker ps -aq)
docker rm -f $(docker ps -aq) || systemctl restart docker
- name: Remove models dir
working-directory: tests/integration
run: |
Expand All @@ -250,7 +230,7 @@ jobs:
stop-runners-p4d:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners-p4d, lmi-dist-aiccl-test, trtllm-test, vllm-test ]
needs: [ create-runners-p4d, lmi-dist-test, trtllm-test, vllm-test ]
steps:
- name: Stop all instances
run: |
Expand Down

0 comments on commit 325d573

Please sign in to comment.