diff --git a/docs/source/features/distributed-kv-cache.rst b/docs/source/features/distributed-kv-cache.rst
index 7fc53ca3..39327514 100644
--- a/docs/source/features/distributed-kv-cache.rst
+++ b/docs/source/features/distributed-kv-cache.rst
@@ -29,9 +29,9 @@ After deployment, we can see all the components by using ``kubectl get pods -n a
 
 .. code-block:: RST
 
-    NAME                                                      READY   STATUS    RESTARTS   AGE
-    aibrix-model-deepseek-coder-7b-kvcache-596965997-p86cx    1/1     Running   0          2m
-    aibrix-model-deepseek-coder-7b-kvcache-etcd-0             1/1     Running   0          2m
+    NAME                                        READY   STATUS    RESTARTS   AGE
+    deepseek-coder-7b-kvcache-596965997-p86cx   1/1     Running   0          2m
+    deepseek-coder-7b-kvcache-etcd-0            1/1     Running   0          2m
 
 After all components are running, we can use the following yaml to deploy the inference service:
 
@@ -49,9 +49,8 @@ Now let's use ``kubectl get pods`` command to ensure the inference service is ru
 
 .. code-block:: RST
 
-    NAME                                                       READY   STATUS    RESTARTS   AGE
-    download-model                                             1/1     Running   0          12m
-    aibrix-model-deepseek-coder-7b-instruct-6b885ffd8b-2kfjv   2/2     Running   0          4m
+    NAME                                          READY   STATUS    RESTARTS   AGE
+    deepseek-coder-7b-instruct-6b885ffd8b-2kfjv   2/2     Running   0          4m
 
 After launching AIBrix's deployment, we can use the following yaml to deploy a distributed KV cache cluster:
 
@@ -65,6 +64,16 @@ After launching AIBrix's deployment, we can use the following yaml to deploy a d
 
     2. ``kvcache.orchestration.aibrix.ai/node-affinity-gpu-type`` is unnecessary unless you deploy the model across different GPUs.
 
+Run ``kubectl get pods -o wide`` to verify that all pods are running:
+
+.. code-block:: RST
+
+    NAME                                          READY   STATUS    RESTARTS   AGE     IP               NODE                                           NOMINATED NODE   READINESS GATES
+    deepseek-coder-7b-instruct-85664648c7-xgp9h   1/1     Running   0          2m41s   192.168.59.224   ip-192-168-41-184.us-west-2.compute.internal
+    deepseek-coder-7b-kvcache-7d5896cd89-dcfzt    1/1     Running   0          2m31s   192.168.37.154   ip-192-168-41-184.us-west-2.compute.internal
+    deepseek-coder-7b-kvcache-etcd-0              1/1     Running   0          2m31s   192.168.19.197   ip-192-168-3-183.us-west-2.compute.internal
+
+
 Once the inference service is running, let's set up port forwarding so that we can test the service from local:
 
 * Run ``kubectl get svc -n envoy-gateway-system`` to get the name of the Envoy Gateway service.
diff --git a/docs/source/features/multi-node-inference.rst b/docs/source/features/multi-node-inference.rst
index 27c41e19..f740a16a 100644
--- a/docs/source/features/multi-node-inference.rst
+++ b/docs/source/features/multi-node-inference.rst
@@ -53,7 +53,7 @@ Workloads Examples
 
 This is the ``RayClusterFleet`` example, you can apply this yaml in your cluster.
 
-.. literalinclude:: ../../../samples/distributed/fleet.yaml
+.. literalinclude:: ../../../samples/distributed/fleet-two-node.yaml
    :language: yaml
 
 
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
index d403b455..ebf52f7c 100644
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -59,6 +59,11 @@ Depending on where you deployed the AIBrix, you can use either of the following
     kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 &
     ENDPOINT="localhost:8888"
 
+.. attention::
+
+    Some cloud providers, such as AWS EKS, expose the endpoint in the ``hostname`` field; in that case, use ``.status.loadBalancer.ingress[0].hostname`` instead.
+
+
 .. code-block:: bash
 
     # completion api
diff --git a/samples/distributed/fleet-two-node.yaml b/samples/distributed/fleet-two-node.yaml
new file mode 100644
index 00000000..850eea4c
--- /dev/null
+++ b/samples/distributed/fleet-two-node.yaml
@@ -0,0 +1,74 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: RayClusterFleet
+metadata:
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  name: facebook-opt-13b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: facebook-opt-13b
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: facebook-opt-13b
+      annotations:
+        ray.io/overwrite-container-cmd: "true"
+    spec:
+      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      headGroupSpec:
+        rayStartParams:
+          dashboard-host: '0.0.0.0'
+        template:
+          spec:
+            containers:
+              - name: ray-head
+                image: vllm/vllm-openai:v0.7.1
+                ports:
+                  - containerPort: 6379
+                    name: gcs-server
+                  - containerPort: 8265
+                    name: dashboard
+                  - containerPort: 10001
+                    name: client
+                  - containerPort: 8000
+                    name: service
+                command: ["/bin/bash", "-lc", "--"]
+                args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
+                resources:
+                  limits:
+                    cpu: "4"
+                    nvidia.com/gpu: 1
+                  requests:
+                    cpu: "4"
+                    nvidia.com/gpu: 1
+      workerGroupSpecs:
+        # the pod replicas in this group typed worker
+        - replicas: 1
+          minReplicas: 1
+          maxReplicas: 5
+          groupName: small-group
+          rayStartParams: {}
+          template:
+            spec:
+              containers:
+                - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
+                  image: vllm/vllm-openai:v0.7.1
+                  lifecycle:
+                    preStop:
+                      exec:
+                        command: [ "/bin/sh","-c","ray stop" ]
+                  resources:
+                    limits:
+                      cpu: "4"
+                      nvidia.com/gpu: 1
+                    requests:
+                      cpu: "4"
+                      nvidia.com/gpu: 1
diff --git a/samples/distributed/fleet.yaml b/samples/distributed/fleet.yaml
index 2b182801..3bf95c1a 100644
--- a/samples/distributed/fleet.yaml
+++ b/samples/distributed/fleet.yaml
@@ -4,12 +4,12 @@ metadata:
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
-  name: facebook-opt-13b
+  name: qwen-coder-7b-instruct
 spec:
   replicas: 1
   selector:
     matchLabels:
-      model.aibrix.ai/name: facebook-opt-13b
+      model.aibrix.ai/name: qwen-coder-7b-instruct
   strategy:
     rollingUpdate:
       maxSurge: 25%
@@ -18,7 +18,7 @@ spec:
   template:
     metadata:
       labels:
-        model.aibrix.ai/name: facebook-opt-13b
+        model.aibrix.ai/name: qwen-coder-7b-instruct
       annotations:
         ray.io/overwrite-container-cmd: "true"
     spec:
@@ -44,7 +44,7 @@ spec:
               # Starting from v1.1.0, KubeRay injects the environment variable `KUBERAY_GEN_RAY_START_CMD`
               # into the Ray container. This variable can be used to retrieve the generated Ray start command.
               # Note that this environment variable does not include the `ulimit` command.
-              args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve /models/llama-2-7b-hf/ --served-model-name meta-llama/llama-2-7b-hf --tensor-parallel-size 2 --distributed-executor-backend ray"]
+              args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve Qwen/Qwen2.5-Coder-7B-Instruct --served-model-name qwen-coder-7b-instruct --tensor-parallel-size 2 --distributed-executor-backend ray"]
               resources:
                 limits:
                   cpu: "8000m"
diff --git a/samples/kvcache/deployment-tp.yaml b/samples/kvcache/deployment-tp.yaml
index 902a9531..2a391999 100644
--- a/samples/kvcache/deployment-tp.yaml
+++ b/samples/kvcache/deployment-tp.yaml
@@ -1,17 +1,24 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: aibrix-model-deepseek-coder-33b-instruct
+  name: deepseek-coder-33b-instruct
   labels:
     model.aibrix.ai/name: deepseek-coder-33b-instruct
     model.aibrix.ai/port: "8000"
 spec:
+  replicas: 1
   strategy:
     rollingUpdate:
       maxSurge: 1
       maxUnavailable: 1
     type: RollingUpdate
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: deepseek-coder-33b-instruct
   template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: deepseek-coder-33b-instruct
     spec:
       containers:
         - name: vllm-openai
@@ -58,7 +65,12 @@ spec:
           volumeMounts:
             - mountPath: /var/run
               name: kvcache-socket
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
       volumes:
         - name: kvcache-socket
           hostPath:
-            path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-33b-kvcache-rpc
\ No newline at end of file
+            path: /var/run/vineyard-kubernetes/default/deepseek-coder-33b-kvcache
\ No newline at end of file
diff --git a/samples/kvcache/deployment.yaml b/samples/kvcache/deployment.yaml
index 6d847208..ff752e61 100644
--- a/samples/kvcache/deployment.yaml
+++ b/samples/kvcache/deployment.yaml
@@ -1,17 +1,24 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: aibrix-model-deepseek-coder-7b-instruct
+  name: deepseek-coder-7b-instruct
   labels:
     model.aibrix.ai/name: deepseek-coder-7b-instruct
     model.aibrix.ai/port: "8000"
 spec:
+  replicas: 1
   strategy:
     rollingUpdate:
       maxSurge: 1
       maxUnavailable: 1
     type: RollingUpdate
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: deepseek-coder-7b-instruct
   template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: deepseek-coder-7b-instruct
     spec:
       containers:
         - name: vllm-openai
@@ -26,9 +33,9 @@ spec:
             - --model
             - deepseek-ai/deepseek-coder-6.7b-instruct
             - --served-model-name
-            - deepseek-coder-6.7b-instruct
+            - deepseek-coder-7b-instruct
             - --max-model-len
-            - "17000"
+            - "12288"
             - --enable-prefix-caching
             - --disable-fastapi-docs
           env:
@@ -53,7 +60,12 @@ spec:
           volumeMounts:
             - mountPath: /var/run
              name: kvcache-socket
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
       volumes:
         - name: kvcache-socket
           hostPath:
-            path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-7b-kvcache-rpc
+            path: /var/run/vineyard-kubernetes/default/deepseek-coder-7b-kvcache
diff --git a/samples/kvcache/kvcache-tp.yaml b/samples/kvcache/kvcache-tp.yaml
index eb10070c..d69eb674 100644
--- a/samples/kvcache/kvcache-tp.yaml
+++ b/samples/kvcache/kvcache-tp.yaml
@@ -1,7 +1,7 @@
 apiVersion: orchestration.aibrix.ai/v1alpha1
 kind: KVCache
 metadata:
-  name: aibrix-model-deepseek-coder-33b-kvcache
+  name: deepseek-coder-33b-kvcache
   namespace: default
   annotations:
     # kvcache.orchestration.aibrix.ai/node-affinity-gpu-type: NVIDIA-L20
@@ -11,6 +11,6 @@ spec:
   service:
     type: ClusterIP
     port: 9600
-  cache:
+  cacheSpec:
     image: aibrix/vineyardd:20241120
     imagePullPolicy: IfNotPresent
diff --git a/samples/kvcache/kvcache.yaml b/samples/kvcache/kvcache.yaml
index d674c137..fb178195 100644
--- a/samples/kvcache/kvcache.yaml
+++ b/samples/kvcache/kvcache.yaml
@@ -1,15 +1,15 @@
 apiVersion: orchestration.aibrix.ai/v1alpha1
 kind: KVCache
 metadata:
-  name: aibrix-model-deepseek-coder-7b-kvcache
+  name: deepseek-coder-7b-kvcache
   namespace: default
   annotations:
-    kvcache.orchestration.aibrix.ai/pod-affinity-workload: aibrix-model-deepseek-coder-7b-instruct
+    kvcache.orchestration.aibrix.ai/pod-affinity-workload: deepseek-coder-7b-instruct
 spec:
   replicas: 1
   service:
     type: ClusterIP
     port: 9600
-  cache:
+  cacheSpec:
     image: aibrix/vineyardd:20241120
     imagePullPolicy: IfNotPresent
diff --git a/samples/quickstart/model.yaml b/samples/quickstart/model.yaml
index a94ca2d7..39eb5133 100644
--- a/samples/quickstart/model.yaml
+++ b/samples/quickstart/model.yaml
@@ -30,6 +30,8 @@ spec:
             - --served-model-name
             # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
             - deepseek-r1-distill-llama-8b
+            - --max-model-len
+            - "12288" # 12k context length, this is to avoid "The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache" issue.
           image: vllm/vllm-openai:v0.7.1
           imagePullPolicy: Always
           name: vllm-openai
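
As a quick end-to-end check of the renamed manifests above, the KV cache samples can be applied and the gateway exercised with a completion request. The sketch below is illustrative only: the file paths, the Envoy Gateway service name, and the ``deepseek-coder-7b-instruct`` model name are taken from the docs and samples in this diff; your gateway service name will differ, and any authentication headers required by your deployment are omitted.

.. code-block:: bash

    # Apply the distributed KV cache cluster and the matching inference deployment.
    kubectl apply -f samples/kvcache/kvcache.yaml
    kubectl apply -f samples/kvcache/deployment.yaml

    # Wait until the kvcache, etcd, and inference pods are all Running.
    kubectl get pods -o wide

    # Port-forward the Envoy Gateway (service name as in the quickstart; look yours up with
    # `kubectl get svc -n envoy-gateway-system`) and send a test completion request.
    kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 &
    ENDPOINT="localhost:8888"

    curl -s http://${ENDPOINT}/v1/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "deepseek-coder-7b-instruct", "prompt": "def quicksort(arr):", "max_tokens": 64}'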