From 85a7411d491dbdba6042d7c2d714ccdb21aa702a Mon Sep 17 00:00:00 2001
From: Jiaxin Shan
Date: Tue, 8 Oct 2024 17:07:17 -0700
Subject: [PATCH] Add image build details and examples for multi-host inference (#278)

Add downstream details to support multi-host inference.
---
 ...tration_v1alpha1_rayclusterreplicaset.yaml | 24 ++++++
 docs/tutorial/distributed/README.md           | 19 +++++
 docs/tutorial/distributed/fleet.yaml          | 74 +++++++++++++++++++
 3 files changed, 117 insertions(+)
 create mode 100644 docs/tutorial/distributed/fleet.yaml

diff --git a/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml b/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
index 57abedce..f7499fe2 100644
--- a/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
+++ b/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
@@ -31,6 +31,8 @@ spec:
                name: dashboard
              - containerPort: 10001
                name: client
+             - containerPort: 8000
+               name: service
              resources:
                limits:
                  cpu: 1
@@ -38,3 +40,25 @@ spec:
                requests:
                  cpu: 1
                  memory: "1024Mi"
+      workerGroupSpecs:
+      - replicas: 1
+        minReplicas: 1
+        maxReplicas: 5
+        groupName: small-group
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+            - name: ray-worker # must consist of lowercase alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
+              image: rayproject/ray:2.10.0
+              lifecycle:
+                preStop:
+                  exec:
+                    command: [ "/bin/sh", "-c", "ray stop" ]
+              resources:
+                limits:
+                  cpu: 1
+                  memory: "1024Mi"
+                requests:
+                  cpu: 1
+                  memory: "1024Mi"
diff --git a/docs/tutorial/distributed/README.md b/docs/tutorial/distributed/README.md
index e1f3952a..997812c4 100644
--- a/docs/tutorial/distributed/README.md
+++ b/docs/tutorial/distributed/README.md
@@ -1,5 +1,24 @@
 # Run vLLM Distributed Inference with Ray
 
+## Container Image
+
+> Note: some upstream work has not been merged yet, so we need to apply a few downstream changes.
+
+```
+FROM vllm/vllm-openai:v0.6.2
+RUN apt update && apt install -y wget  # important for the health check later
+RUN pip3 install ray[default]          # important for the health check later
+COPY utils.py /usr/local/lib/python3.12/dist-packages/vllm/executor/ray_utils.py
+ENTRYPOINT [""]
+```
+
+> Note: copy utils.py from the upstream version and remove the placement group validation logic. See [#228](https://github.com/aibrix/aibrix/issues/228) for more details.
+> Note: there is no need to downgrade Ray to v2.10.0; only the ray-project/ray image seems to have this issue.
+
+The following container image combination supports distributed multi-host inference (a build sketch follows the list):
+- aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/kuberay-operator:v1.2.1-patch
+- aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
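+
+As a rough build sketch for the second image in the list (the registry and tag below simply reuse the names above — substitute your own; the patched utils.py is assumed to sit next to the Dockerfile in the build context):
+
+```
+docker build -t aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed .
+docker push aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
+```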
+
 
 ## Environment Setup
 
 ### Configure the GPU Cloud Instance
diff --git a/docs/tutorial/distributed/fleet.yaml b/docs/tutorial/distributed/fleet.yaml
new file mode 100644
index 00000000..70cb9ac4
--- /dev/null
+++ b/docs/tutorial/distributed/fleet.yaml
@@ -0,0 +1,74 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: RayClusterFleet
+metadata:
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  name: facebook-opt-13b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      models.aibricks.ai: facebook-opt-13b
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        models.aibricks.ai: facebook-opt-13b
+      annotations:
+        ray.io/overwrite-container-cmd: "true"
+    spec:
+      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      headGroupSpec:
+        rayStartParams:
+          dashboard-host: '0.0.0.0'
+        template:
+          spec:
+            containers:
+            - name: ray-head
+              image: rayproject/ray:2.10.0
+              ports:
+              - containerPort: 6379
+                name: gcs-server
+              - containerPort: 8265
+                name: dashboard
+              - containerPort: 10001
+                name: client
+              - containerPort: 8000
+                name: service
+              command: ["/bin/bash", "-lc", "--"]
+              args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
+              resources:
+                limits:
+                  cpu: "1000m"
+                  nvidia.com/gpu: 1
+                requests:
+                  cpu: "200m"
+                  nvidia.com/gpu: 1
+      workerGroupSpecs:
+      # the Pod replicas in this group are typed as workers
+      - replicas: 1
+        minReplicas: 1
+        maxReplicas: 5
+        groupName: small-group
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+            - name: ray-worker # must consist of lowercase alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
+              image: rayproject/ray:2.10.0
+              lifecycle:
+                preStop:
+                  exec:
+                    command: [ "/bin/sh", "-c", "ray stop" ]
+              resources:
+                limits:
+                  cpu: "1000m"
+                  nvidia.com/gpu: 1
+                requests:
+                  cpu: "200m"
+                  nvidia.com/gpu: 1
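+
+# Rough usage sketch (names are illustrative — the head Service is generated
+# per RayCluster by the KubeRay operator, so check `kubectl get svc` for the
+# actual name before port-forwarding):
+#
+#   kubectl apply -f docs/tutorial/distributed/fleet.yaml
+#   kubectl port-forward svc/<head-svc> 8000:8000
+#   curl http://localhost:8000/v1/models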