diff --git a/docs/examples/vllm/README.md b/docs/examples/vllm/README.md
index 5a9df4f2..89696e81 100644
--- a/docs/examples/vllm/README.md
+++ b/docs/examples/vllm/README.md
@@ -8,7 +8,7 @@ In this example, we will use LeaderWorkerSet to deploy a distributed inference s
 Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)
 
 ## Deploy LeaderWorkerSet of vLLM
-We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has two pods (tp=2).
+We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has two pods (pipeline_parallel_size=2) and 8 GPUs per pod (tensor_parallel_size=8).
 The leader pod runs the Ray head and the http server, while the workers run the Ray workers.
 
 ```shell
@@ -67,7 +67,7 @@ Open another terminal and send a request
 curl http://localhost:8080/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-    "model": "facebook/opt-125m",
+    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
     "prompt": "San Francisco is a",
     "max_tokens": 7,
     "temperature": 0
@@ -80,11 +80,11 @@ The output should be similar to the following
 {
     "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
     "object": "text_completion",
     "created": 1715138766,
-    "model": "facebook/opt-125m",
+    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
     "choices": [
         {
             "index": 0,
-            "text": " great place to live. I",
+            "text": " top destination for foodies, with",
             "logprobs": null,
             "finish_reason": "length",
             "stop_reason": null
diff --git a/docs/examples/vllm/build/Dockerfile b/docs/examples/vllm/build/Dockerfile
index 34d6ecff..b292a929 100644
--- a/docs/examples/vllm/build/Dockerfile
+++ b/docs/examples/vllm/build/Dockerfile
@@ -1,2 +1,2 @@
-FROM docker.io/vllm/vllm-openai:v0.4.1
+FROM docker.io/vllm/vllm-openai:v0.5.3.post1
 COPY ray_init.sh /vllm-workspace/ray_init.sh
diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index 9328f802..225773f9 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -14,25 +14,27 @@ spec:
       spec:
         containers:
           - name: vllm-leader
-            # this image is build with the Dockerfile under ./build
-            image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
+            image: <image-built-from-dockerfile>
             env:
               - name: RAY_CLUSTER_SIZE
                 valueFrom:
                   fieldRef:
                     fieldPath: metadata.annotations['leaderworkerset.sigs.k8s.io/size']
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
             command:
               - sh
               - -c
               - "/vllm-workspace/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE;
-                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model facebook/opt-125m --swap-space 2 --tensor-parallel-size 2"
+                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
             resources:
               limits:
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
               requests:
-                cpu: "4"
-                memory: 8Gi
-                nvidia.com/gpu: "1"
+                ephemeral-storage: 800Gi
+                cpu: 125
             ports:
               - containerPort: 8080
             readinessProbe:
@@ -40,23 +42,31 @@ spec:
               tcpSocket:
                 port: 8080
               initialDelaySeconds: 15
               periodSeconds: 10
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
     workerTemplate:
       spec:
         containers:
          - name: vllm-worker
-            # this image is build with the Dockerfile under ./build
-            image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
+            image: <image-built-from-dockerfile>
             command:
               - sh
               - -c
               - "/vllm-workspace/ray_init.sh worker --ray_address=$(LEADER_NAME).$(LWS_NAME).$(NAMESPACE).svc.cluster.local"
             resources:
               limits:
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 800Gi
               requests:
-                cpu: "4"
-                memory: 8Gi
-                nvidia.com/gpu: "1"
+                ephemeral-storage: 800Gi
+                cpu: 125
             env:
               - name: LEADER_NAME
                 valueFrom:
@@ -70,3 +80,13 @@ spec:
                 valueFrom:
                   fieldRef:
                     fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name']
+              - name: HUGGING_FACE_HUB_TOKEN
+                value: <your-hf-token>
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
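One review note on the new `HUGGING_FACE_HUB_TOKEN` entries: the patch takes the token as a literal `value:`, so it ends up in plain text in the committed manifest. A common alternative is to source the env var from a Kubernetes Secret instead. Below is a minimal sketch of that variant; the secret name `hf-secret` and key `token` are hypothetical, not part of this change, and would be created beforehand with something like `kubectl create secret generic hf-secret --from-literal=token=<your-hf-token>`. It would replace the `value:` line in both the leader and worker templates:

```yaml
# Hypothetical Secret-based variant of the HUGGING_FACE_HUB_TOKEN env entry;
# assumes a pre-created Secret named "hf-secret" with a "token" key.
- name: HUGGING_FACE_HUB_TOKEN
  valueFrom:
    secretKeyRef:
      name: hf-secret
      key: token
```

This keeps the token out of the manifest while leaving the rest of the example unchanged.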