
Commit

1) update README.md 2) update the leader pod commands 3) remove namespace in service.yaml 4) update subcommand from head to leader

Signed-off-by: zibai <zibai.gj@alibaba-inc.com>
gujingit committed May 9, 2024
1 parent d2d75b5 commit b3a8023
Showing 4 changed files with 22 additions and 17 deletions.
12 changes: 6 additions & 6 deletions docs/example/vllm/README.md
@@ -1,21 +1,21 @@
# Deploy distributed vLLM with LWS on GPUs
# Deploy Distributed Inference Service with vLLM and LWS on GPUs

In this example, we will use LeaderWorkerSet to deploy a distributed inference instance with vLLM on GPUs.
In this example, we will use LeaderWorkerSet to deploy a distributed inference service with vLLM on GPUs.
[vLLM](https://docs.vllm.ai/en/latest/index.html) supports distributed tensor-parallel inference and serving. Currently, it supports Megatron-LM’s tensor parallel algorithm. It manages the distributed runtime with [Ray](https://docs.ray.io/en/latest/index.html). See the doc [vLLM Distributed Inference and Serving](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) for more details.

## Install LeaderWorkerSet

Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)

## Deploy LeaderWorkerSet Deployment
We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replicas has two pods (tp=2).
## Deploy LeaderWorkerSet of vLLM
We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has two pods (tp=2).
The leader pod runs the Ray head and the http server, while the workers run the Ray workers.

```shell
kubectl apply -f lws.yaml
```
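As an optional extra check (not part of the original README), you can confirm that the LeaderWorkerSet resource itself was created before looking at the pods; the resource name `vllm` is inferred from the Service selector further down.
```shell
# Sketch only: assumes the LeaderWorkerSet in lws.yaml is named "vllm".
kubectl get leaderworkerset vllm
```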

Verify the status of the vLLM Deployment
Verify the status of the vLLM pods
```shell
kubectl get pods
```
@@ -38,7 +38,7 @@ Should get an output similar to this
INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
```
The total weight of the facebook/opt-125m model is about 0.25 GB; with tensor parallelism across two pods (tp=2), each pod loads roughly half of it, about 0.1189 GB.


# Use vLLM
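The body of this section is collapsed in the diff. As a rough sketch of what a request against the deployment could look like (an assumption, not taken from the collapsed text): the leader pod serves an OpenAI-compatible API on port 8080 with the `facebook/opt-125m` model, both of which come from the command in `lws.yaml`.
```shell
# Sketch only: assumes the vllm-leader Service is reachable locally,
# e.g. via `kubectl port-forward svc/vllm-leader 8080:8080`.
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "facebook/opt-125m",
        "prompt": "San Francisco is a",
        "max_tokens": 16,
        "temperature": 0
      }'
```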

16 changes: 10 additions & 6 deletions docs/example/vllm/build/ray_init.sh
@@ -1,5 +1,5 @@
#!/bin/bash
set -e

subcommand=$1
shift

@@ -17,6 +17,9 @@ case "$subcommand" in
--ray_port=*)
ray_port="${1#*=}"
;;
--ray_init_timeout=*)
ray_init_timeout="${1#*=}"
;;
*)
echo "unknown argument: $1"
exit 1
@@ -35,14 +38,14 @@ case "$subcommand" in
echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
exit 0
fi
echo "Wait the ray worker to be active..."
echo "Waiting until the ray worker is active..."
sleep 5s;
done
echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
exit 1
;;

head)
leader)
ray_cluster_size=""
while [ $# -gt 0 ]; do
case "$1" in
@@ -67,8 +70,10 @@ case "$subcommand" in
exit 1
fi

# start the ray daemon
ray start --head --port=$ray_port
# wait all workers to be active

# wait until all workers are active
for (( i=0; i < $ray_init_timeout; i+=5 )); do
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
if [ $active_nodes -eq $ray_cluster_size ]; then
@@ -79,7 +84,7 @@ case "$subcommand" in
sleep 5s;
done

echo "Waiting for all ray workers to be active has been timed out."
echo "Waiting for all ray workers to be active timed out."
exit 1
;;

@@ -89,4 +94,3 @@ case "$subcommand" in
;;
esac
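For reference, here is a sketch of how the two subcommands of `ray_init.sh` are meant to be invoked, based on the flags parsed in the hunks above. The `leader` invocation matches the command in `lws.yaml` below; the worker subcommand name, the `$LEADER_ADDRESS` variable, and the port/timeout values are assumptions for illustration.
```shell
# Leader pod: start the Ray head, then block until ray_cluster_size nodes report Alive.
/vllm-workspace/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE

# Worker pods (subcommand name assumed): join the head and retry until it is reachable.
/vllm-workspace/ray_init.sh worker --ray_address=$LEADER_ADDRESS --ray_port=6379 --ray_init_timeout=300
```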


8 changes: 5 additions & 3 deletions docs/example/vllm/lws.yaml
@@ -14,6 +14,7 @@ spec:
spec:
containers:
- name: vllm-leader
# this image is built with the Dockerfile under ./build
image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
env:
- name: RAY_CLUSTER_SIZE
@@ -23,8 +24,8 @@ spec:
command:
- sh
- -c
- "/vllm-workspace/ray_init.sh head --ray_cluster_size=$RAY_CLUSTER_SIZE;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model facebook/opt-125m --gpu-memory-utilization 0.95 --tensor-parallel-size 2"
- "/vllm-workspace/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model facebook/opt-125m --swap-space 2 --tensor-parallel-size 2"
resources:
limits:
nvidia.com/gpu: "1"
@@ -43,6 +44,7 @@ spec:
spec:
containers:
- name: vllm-worker
# this image is built with the Dockerfile under ./build
image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
command:
- sh
@@ -67,4 +69,4 @@ spec:
- name: LWS_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name']
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name']
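A hedged way to sanity-check the leader container after applying `lws.yaml` is to tail its logs and watch the Ray cluster form and the model weights load (the output shown in the README above). The pod name `vllm-0` assumes the LWS is named `vllm` and that leader pods follow the `<name>-<group index>` convention; the container name `vllm-leader` is taken from the manifest.
```shell
# Assumption: the first leader pod is named "vllm-0".
kubectl logs -f vllm-0 -c vllm-leader
```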
3 changes: 1 addition & 2 deletions docs/example/vllm/service.yaml
@@ -2,7 +2,6 @@ apiVersion: v1
kind: Service
metadata:
name: vllm-leader
namespace: default
spec:
ports:
- name: http
@@ -12,4 +11,4 @@ spec:
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP
type: ClusterIP
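With the `namespace` field removed, the Service is now created in whatever namespace the current kubectl context points at. Here is a hedged example of applying it and reaching the leader from a local machine; the Service port is collapsed in this diff, so forwarding to 8080 is an assumption based on the `--port 8080` flag in `lws.yaml`.
```shell
kubectl apply -f service.yaml
# Forward a local port to the ClusterIP Service that selects the leader pod.
kubectl port-forward svc/vllm-leader 8080:8080
```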
