From 85a7411d491dbdba6042d7c2d714ccdb21aa702a Mon Sep 17 00:00:00 2001
From: Jiaxin Shan
Date: Tue, 8 Oct 2024 17:07:17 -0700
Subject: [PATCH] Add image build details and examples for multi-host inference (#278)

Add downstream details to support multi-host inference.
---
 ...tration_v1alpha1_rayclusterreplicaset.yaml | 24 ++++++
 docs/tutorial/distributed/README.md           | 19 +++++
 docs/tutorial/distributed/fleet.yaml          | 74 +++++++++++++++++++
 3 files changed, 117 insertions(+)
 create mode 100644 docs/tutorial/distributed/fleet.yaml

diff --git a/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml b/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
index 57abedce..f7499fe2 100644
--- a/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
+++ b/config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
@@ -31,6 +31,8 @@ spec:
                name: dashboard
              - containerPort: 10001
                name: client
+             - containerPort: 8000
+               name: service
              resources:
                limits:
                  cpu: 1
@@ -38,3 +40,25 @@ spec:
                requests:
                  cpu: 1
                  memory: "1024Mi"
+      workerGroupSpecs:
+      - replicas: 1
+        minReplicas: 1
+        maxReplicas: 5
+        groupName: small-group
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+            - name: ray-worker # must consist of lowercase alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
+              image: rayproject/ray:2.10.0
+              lifecycle:
+                preStop:
+                  exec:
+                    command: [ "/bin/sh", "-c", "ray stop" ]
+              resources:
+                limits:
+                  cpu: 1
+                  memory: "1024Mi"
+                requests:
+                  cpu: 1
+                  memory: "1024Mi"
diff --git a/docs/tutorial/distributed/README.md b/docs/tutorial/distributed/README.md
index e1f3952a..997812c4 100644
--- a/docs/tutorial/distributed/README.md
+++ b/docs/tutorial/distributed/README.md
@@ -1,5 +1,24 @@
 # Run vLLM Distributed Inference with Ray
 
+## Container Image
+
+> Note: some upstream work has not been merged yet, so we need to apply a few downstream changes.
+
+```
+FROM vllm/vllm-openai:v0.6.2
+RUN apt update && apt install -y wget  # important for the health check later
+RUN pip3 install ray[default]          # important for the health check later
+COPY utils.py /usr/local/lib/python3.12/dist-packages/vllm/executor/ray_utils.py
+ENTRYPOINT [""]
+```
+
+> Note: copy utils.py from the upstream version and remove the placement group validation logic. See [#228](https://github.com/aibrix/aibrix/issues/228) for more details.
+> Note: there is no need to downgrade Ray to v2.10.0; only the ray-project/ray image seems to have this issue.
+
+The following container image combination supports distributed multi-host inference (a build sketch follows the list):
+- aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/kuberay-operator:v1.2.1-patch
+- aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
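+
+As a rough build sketch for the second image in the list (the registry and tag below simply reuse the names above — substitute your own; the patched utils.py is assumed to sit next to the Dockerfile in the build context):
+
+```
+docker build -t aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed .
+docker push aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
+```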
+
 
 ## Environment Setup
 
 ### Configure the GPU Cloud Instance
diff --git a/docs/tutorial/distributed/fleet.yaml b/docs/tutorial/distributed/fleet.yaml
new file mode 100644
index 00000000..70cb9ac4
--- /dev/null
+++ b/docs/tutorial/distributed/fleet.yaml
@@ -0,0 +1,74 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: RayClusterFleet
+metadata:
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  name: facebook-opt-13b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      models.aibricks.ai: facebook-opt-13b
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        models.aibricks.ai: facebook-opt-13b
+      annotations:
+        ray.io/overwrite-container-cmd: "true"
+    spec:
+      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      headGroupSpec:
+        rayStartParams:
+          dashboard-host: '0.0.0.0'
+        template:
+          spec:
+            containers:
+            - name: ray-head
+              image: rayproject/ray:2.10.0
+              ports:
+              - containerPort: 6379
+                name: gcs-server
+              - containerPort: 8265
+                name: dashboard
+              - containerPort: 10001
+                name: client
+              - containerPort: 8000
+                name: service
+              command: ["/bin/bash", "-lc", "--"]
+              args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
+              resources:
+                limits:
+                  cpu: "1000m"
+                  nvidia.com/gpu: 1
+                requests:
+                  cpu: "200m"
+                  nvidia.com/gpu: 1
+      workerGroupSpecs:
+      # the Pod replicas in this group are typed as workers
+      - replicas: 1
+        minReplicas: 1
+        maxReplicas: 5
+        groupName: small-group
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+            - name: ray-worker # must consist of lowercase alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
+              image: rayproject/ray:2.10.0
+              lifecycle:
+                preStop:
+                  exec:
+                    command: [ "/bin/sh", "-c", "ray stop" ]
+              resources:
+                limits:
+                  cpu: "1000m"
+                  nvidia.com/gpu: 1
+                requests:
+                  cpu: "200m"
+                  nvidia.com/gpu: 1
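+
+# Rough usage sketch (names are illustrative — the head Service is generated
+# per RayCluster by the KubeRay operator, so check `kubectl get svc` for the
+# actual name before port-forwarding):
+#
+#   kubectl apply -f docs/tutorial/distributed/fleet.yaml
+#   kubectl port-forward svc/<head-svc> 8000:8000
+#   curl http://localhost:8000/v1/models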