Merge pull request #185 from Edwinhr716/vllm-docs-update
updating vllm docs to use llama3 405B as example
k8s-ci-robot authored Aug 6, 2024
2 parents 064c64e + a3b7864 commit 0f9fa08
Showing 3 changed files with 38 additions and 18 deletions.
8 changes: 4 additions & 4 deletions docs/examples/vllm/README.md
@@ -8,7 +8,7 @@ In this example, we will use LeaderWorkerSet to deploy a distributed inference s
Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)

## Deploy LeaderWorkerSet of vLLM
We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has two pods (tp=2).
We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 GPUs per pod (tensor_parallel_size=8).
The leader pod runs the Ray head and the http server, while the workers run the Ray workers.
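With these values, each replica spans 2 pods × 8 GPUs = 16 GPUs, so the two replicas together need 32 GPUs. After applying the manifest (command below), a quick sanity check is to list the pods; the output here is a sketch that assumes LWS's default pod naming and a LeaderWorkerSet named `vllm`, neither of which is shown in this diff:

```shell
kubectl get pods
# Hypothetical output for 2 replicas of size 2 (leader + one worker each):
# NAME       READY   STATUS    RESTARTS   AGE
# vllm-0     1/1     Running   0          5m   <- leader of replica 0
# vllm-0-1   1/1     Running   0          5m   <- worker of replica 0
# vllm-1     1/1     Running   0          5m   <- leader of replica 1
# vllm-1-1   1/1     Running   0          5m   <- worker of replica 1
```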

```shell
@@ -67,7 +67,7 @@ Open another terminal and send a request
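The request below assumes the leader service was already exposed locally in a step elided from this diff; with the README's port 8080, a typical forward looks like the following (the `vllm-leader` service name is an assumption):

```shell
kubectl port-forward svc/vllm-leader 8080:8080
```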
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "facebook/opt-125m",
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
@@ -80,11 +80,11 @@ The output should be similar to the following
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"object": "text_completion",
"created": 1715138766,
"model": "facebook/opt-125m",
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{
"index": 0,
"text": " great place to live. I",
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
2 changes: 1 addition & 1 deletion docs/examples/vllm/build/Dockerfile
@@ -1,2 +1,2 @@
FROM docker.io/vllm/vllm-openai:v0.4.1
FROM docker.io/vllm/vllm-openai:v0.5.3.post1
COPY ray_init.sh /vllm-workspace/ray_init.sh
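The lws.yaml below references this image through the `<image-built-from-dockerfile>` placeholder. A minimal build-and-push sketch, where the registry and tag are illustrative rather than anything defined in this repo:

```shell
# Build from the example's build/ directory and push to a registry the
# cluster can pull from (registry name and tag are placeholders).
docker build -t <your-registry>/vllm-openai-lws:v0.5.3.post1 docs/examples/vllm/build/
docker push <your-registry>/vllm-openai-lws:v0.5.3.post1
```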
46 changes: 33 additions & 13 deletions docs/examples/vllm/lws.yaml
@@ -14,49 +14,59 @@ spec:
spec:
containers:
- name: vllm-leader
# this image is built with the Dockerfile under ./build
image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
image: <image-built-from-dockerfile>
env:
- name: RAY_CLUSTER_SIZE
valueFrom:
fieldRef:
fieldPath: metadata.annotations['leaderworkerset.sigs.k8s.io/size']
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
command:
- sh
- -c
- "/vllm-workspace/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model facebook/opt-125m --swap-space 2 --tensor-parallel-size 2"
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
resources:
limits:
nvidia.com/gpu: "1"
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
cpu: "4"
memory: 8Gi
nvidia.com/gpu: "1"
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerTemplate:
spec:
containers:
- name: vllm-worker
# this image is built with the Dockerfile under ./build
image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
image: <image-built-from-dockerfile>
command:
- sh
- -c
- "/vllm-workspace/ray_init.sh worker --ray_address=$(LEADER_NAME).$(LWS_NAME).$(NAMESPACE).svc.cluster.local"
resources:
limits:
nvidia.com/gpu: "1"
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
cpu: "4"
memory: 8Gi
nvidia.com/gpu: "1"
ephemeral-storage: 800Gi
cpu: 125
env:
- name: LEADER_NAME
valueFrom:
@@ -70,3 +70,13 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name']
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
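One caveat: the manifest now embeds the Hugging Face token as a plain-text env value (`<your-hf-token>`) in both the leader and worker templates. An alternative, not part of this change, is to keep the token in a Secret and reference it via `secretKeyRef`; creating such a Secret could look like this (the `hf-token` name and `token` key are assumptions):

```shell
# Hypothetical: store the token in a Secret instead of hard-coding it,
# then replace the env entry's literal `value` with valueFrom.secretKeyRef.
kubectl create secret generic hf-token --from-literal=token=<your-hf-token>
```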
