Cherry pick #776 #779 #788 #789 #794 to release branch #803

Closed
2 changes: 1 addition & 1 deletion .github/workflows/docker-build-images.yml
@@ -2,7 +2,7 @@ name: Docker Build Images

on:
  pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "release-*" ]

jobs:
  build:
2 changes: 1 addition & 1 deletion .github/workflows/docker-push-images.yml
@@ -2,7 +2,7 @@ name: Docker Push Images

on:
  push:
-    branches: [ "main", "release-*"]
+    branches: [ "main", "release-*" ]

jobs:
  build:
2 changes: 1 addition & 1 deletion .github/workflows/installation-tests.yml
@@ -3,7 +3,7 @@ name: Installation Tests
on:
  workflow_dispatch: # Allows manual trigger
  push:
-    branches: [ "main" ]
+    branches: [ "main", "release-*" ]
    paths:
      - '.github/workflows/**'
      - 'build/container/**'
2 changes: 1 addition & 1 deletion .github/workflows/lint-and-tests.yml
@@ -3,7 +3,7 @@ name: Linter and Unit Tests
on:
  workflow_dispatch: # Allows manual trigger
  push:
-    branches: [ "main" ]
+    branches: [ "main", "release-*" ]
    paths:
      - 'pkg/**'
      - 'cmd/**'
2 changes: 1 addition & 1 deletion .github/workflows/python-tests.yml
@@ -2,7 +2,7 @@ name: Python Tests

on:
  push:
-    branches: [ "main" ]
+    branches: [ "main", "release-*" ]
    paths:
      - 'python/**'
  pull_request:
2 changes: 1 addition & 1 deletion config/dependency/kuberay-operator/README.md
@@ -3,7 +3,7 @@
Commands to export the manifest from the Helm package. After you have the manifest, copy it to this folder.

```shell
-helm template kuberay-operator kuberay/kuberay-operator --namespace aibrix-system --version 1.2.1 --include-crds --set env[0].name=ENABLE_PROBES_INJECTION --set env[0].value=\"false\" --set fullnameOverride=kuberay-operator --set featureGates[0].name=RayClusterStatusConditions --set featureGates[0].enabled=true --output-dir ./config/dependency
+helm template kuberay-operator kuberay/kuberay-operator --namespace aibrix-system --version 1.2.1 --include-crds --set env[0].name=ENABLE_PROBES_INJECTION --set-string env[0].value=false --set fullnameOverride=kuberay-operator --set featureGates[0].name=RayClusterStatusConditions --set featureGates[0].enabled=true --output-dir ./config/dependency
```

If you use zsh, please use `noglob helm ...` to skip the brace check.
@@ -51,7 +51,7 @@ spec:
            protocol: TCP
          env:
            - name: ENABLE_PROBES_INJECTION
-             value: '"false"'
+             value: "false"
          livenessProbe:
            httpGet:
              path: /metrics
3 changes: 2 additions & 1 deletion development/app/requirements.txt
@@ -8,4 +8,5 @@ ddsketch
plotly_express
fasteners
transformers
-git+https://github.com/zhangjyr/vidur.git
+git+https://github.com/zhangjyr/vidur.git
+ray[default]
38 changes: 21 additions & 17 deletions development/tutorials/distributed/fleet.yaml
@@ -3,7 +3,7 @@ kind: RayClusterFleet
metadata:
  labels:
    app.kubernetes.io/name: aibrix
-    app.kubernetes.io/managed-by: kustomize
+    model.aibrix.ai/name: facebook-opt-13b
  name: facebook-opt-13b
spec:
  replicas: 1
@@ -20,13 +20,17 @@ spec:
      labels:
        model.aibrix.ai/name: facebook-opt-13b
      annotations:
        ray.io/overwrite-container-cmd: "true"
    spec:
-      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      rayVersion: "2.10.0"
      headGroupSpec:
        rayStartParams:
          dashboard-host: '0.0.0.0'
          block: 'false'
        template:
+          metadata:
+            labels:
+              model.aibrix.ai/name: facebook-opt-13b
          spec:
            containers:
            - name: ray-head
@@ -40,35 +44,35 @@ spec:
                name: client
              - containerPort: 8000
                name: service
              command: ["/bin/bash", "-lc", "--"]
-             args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
+             args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; sleep 600"]
              resources:
                limits:
-                 cpu: "1000m"
                  nvidia.com/gpu: 1
+                 cpu: 1000m
                requests:
-                 cpu: "200m"
                  nvidia.com/gpu: 1
+                 cpu: 200m
        workerGroupSpecs:
        # the pod replicas in this group typed worker
-       - replicas: 1
+       - replicas: 2
          minReplicas: 1
          maxReplicas: 5
          groupName: small-group
          rayStartParams: {}
          template:
+           metadata:
+             labels:
+               model.aibrix.ai/name: facebook-opt-13b
          spec:
            containers:
-           - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
-             image: rayproject/ray:2.10.0
+           - name: ray-worker
+             image: 'rayproject/ray:2.10.0'
              command: [ "/bin/bash", "-lc", "--" ]
              args: [ "ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD;" ]
              lifecycle:
                preStop:
                  exec:
                    command: [ "/bin/sh","-c","ray stop" ]
              resources:
                limits:
-                 cpu: "1000m"
                  nvidia.com/gpu: 1
+                 cpu: 1000m
                requests:
-                 cpu: "200m"
                  nvidia.com/gpu: 1
+                 cpu: 200m
137 changes: 137 additions & 0 deletions development/vllm/config/deployment.yaml
@@ -0,0 +1,137 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mock-facebook-opt-125m
  namespace: default
  labels:
    model.aibrix.ai/name: "facebook-opt-125m"
    model.aibrix.ai/port: "8000"
    adapter.model.aibrix.ai/enabled: "true"
spec:
  replicas: 1
  selector:
    matchLabels:
      adapter.model.aibrix.ai/enabled: "true"
      model.aibrix.ai/name: "facebook-opt-125m"
      app: "mock-facebook-opt-125m"
  template:
    metadata:
      labels:
        adapter.model.aibrix.ai/enabled: "true"
        model.aibrix.ai/name: "facebook-opt-125m"
        app: "mock-facebook-opt-125m"
    spec:
      serviceAccountName: mocked-app-sa
      containers:
        - name: llm-engine
          image: aibrix/vllm-cpu-env:macos
          ports:
            - containerPort: 8000
          command:
            - python3
            - -m
            - vllm.entrypoints.openai.api_server
            - --host
            - "0.0.0.0"
            - --port
            - "8000"
            - --uvicorn-log-level
            - warning
            - --model
            - facebook/opt-125m
            - --served-model-name
            - facebook-opt-125m
            - --chat-template
            - /etc/chat-template-config/chat-template.j2
            - --trust-remote-code
            - --device
            - cpu
            - --disable_async_output_proc
            - --enforce-eager
            - --dtype
            - float16
          env:
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['app']
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          volumeMounts:
            - name: model
              mountPath: /root/.cache/huggingface
            - name: chat-template-volume
              mountPath: /etc/chat-template-config
        - name: aibrix-runtime
          image: aibrix/runtime:nightly
          command:
            - aibrix_runtime
            - --port
            - "8080"
          env:
            - name: INFERENCE_ENGINE
              value: vllm
            - name: INFERENCE_ENGINE_ENDPOINT
              value: http://localhost:8000
          ports:
            - containerPort: 8080
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 3
            periodSeconds: 2
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: model
          hostPath:
            path: /root/.cache/huggingface
        - name: chat-template-volume
          configMap:
            name: chat-template-config
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: chat-template-config
data:
  chat-template.j2: |
    {%- if messages[0]['role'] == 'system' -%}
    {%- set system_message = messages[0]['content'] -%}
    {%- set messages = messages[1:] -%}
    {%- else -%}
    {% set system_message = '' -%}
    {%- endif -%}

    {{ bos_token + system_message }}
    {%- for message in messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
    {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
    {%- endif -%}

    {%- if message['role'] == 'user' -%}
    {{ 'USER: ' + message['content'] + '\n' }}
    {%- elif message['role'] == 'assistant' -%}
    {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
    {%- endif -%}
    {%- endfor -%}

    {%- if add_generation_prompt -%}
    {{ 'ASSISTANT:' }}
    {% endif %}
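The `chat-template.j2` above rejects any conversation whose roles do not alternate after an optional leading system message. The same check, expressed as a minimal Python sketch (the helper name is hypothetical, not part of the PR; the condition mirrors the template's `(role == 'user') != (loop.index0 % 2 == 0)` test):

```python
def validate_alternating_roles(messages):
    """Mirror the template's guard: after an optional leading system
    message, 'user' must appear at every even position."""
    if messages and messages[0]["role"] == "system":
        messages = messages[1:]
    for i, msg in enumerate(messages):
        # even index must be 'user'; a 'user' at an odd index also fails
        if (msg["role"] == "user") != (i % 2 == 0):
            raise ValueError(
                "Conversation roles must alternate user/assistant/user/assistant/..."
            )
    return True
```

Running an out-of-order conversation through the template raises the same error string via `raise_exception`.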
22 changes: 22 additions & 0 deletions pkg/cache/cache.go
@@ -75,6 +75,8 @@ type Block struct {

const (
	modelIdentifier = "model.aibrix.ai/name"
+	nodeType = "ray.io/node-type"
+	nodeWorker = "worker"
	podPort = 8000
	defaultPodMetricRefreshIntervalInMS = 50
	expireWriteRequestTraceIntervalInMins = 10
@@ -296,6 +298,12 @@ func (c *Cache) addPod(obj interface{}) {
	if !ok {
		return
	}
+	// ignore worker pods
+	nodeType, ok := pod.Labels[nodeType]
+	if ok && nodeType == nodeWorker {
+		klog.InfoS("ignored ray worker pod", "name", pod.Name)
+		return
+	}

	c.Pods[pod.Name] = pod
	c.addPodAndModelMappingLocked(pod.Name, modelName)
@@ -323,6 +331,20 @@ func (c *Cache) updatePod(oldObj interface{}, newObj interface{}) {
		c.deletePodAndModelMapping(oldPod.Name, oldModelName)
	}

+	// ignore worker pods
+	nodeType, ok := oldPod.Labels[nodeType]
+	if ok && nodeType == nodeWorker {
+		klog.InfoS("ignored ray worker pod", "name", oldPod.Name)
+		return
+	}
+
+	// ignore worker pods
+	nodeType, ok = newPod.Labels[nodeType]
+	if ok && nodeType == nodeWorker {
+		klog.InfoS("ignored ray worker pod", "name", newPod.Name)
+		return
+	}

	// Add new mappings if present
	if newOk {
		c.Pods[newPod.Name] = newPod
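The cache change above skips Ray worker pods by checking the `ray.io/node-type` label before registering a pod. A minimal Python sketch of the same filtering rule (label name and value come from the diff; the helper functions themselves are illustrative, not part of the repo):

```python
NODE_TYPE_LABEL = "ray.io/node-type"
NODE_WORKER = "worker"

def is_ray_worker(pod_labels):
    """True when the pod carries ray.io/node-type=worker."""
    return pod_labels.get(NODE_TYPE_LABEL) == NODE_WORKER

def visible_pods(pods):
    """Keep only pods the cache would track, mirroring Cache.addPod:
    head pods and pods without the node-type label pass through."""
    return {
        name: labels
        for name, labels in pods.items()
        if not is_ray_worker(labels)
    }
```

Only the head pod serves the OpenAI endpoint in a RayClusterFleet, so worker pods carry no routable model replica and are excluded from the cache.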
23 changes: 23 additions & 0 deletions pkg/controller/podautoscaler/podautoscaler_controller.go
@@ -22,8 +22,11 @@ import (
	"time"

	autoscalingv1alpha1 "github.com/aibrix/aibrix/api/autoscaling/v1alpha1"
+	orchestrationv1alpha1 "github.com/aibrix/aibrix/api/orchestration/v1alpha1"
	"github.com/aibrix/aibrix/pkg/config"
	"github.com/aibrix/aibrix/pkg/controller/podautoscaler/metrics"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/selection"

	"github.com/aibrix/aibrix/pkg/controller/podautoscaler/scaler"
	podutil "github.com/aibrix/aibrix/pkg/utils"
@@ -621,6 +624,16 @@ func (r *PodAutoscalerReconciler) computeReplicasForMetrics(ctx context.Context,
		return 0, "", currentTimestamp, err
	}

+	// Append the ray head requirement to the label selector
+	if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
+		newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
+		if err != nil {
+			klog.ErrorS(err, "Failed to add requirement ray.io/node-type=head to label selector")
+			return 0, "", currentTimestamp, err
+		}
+		labelsSelector = labelsSelector.Add(*newRequirement)
+	}

	originalReadyPodsCount, err := scaler.GetReadyPodsCount(ctx, r.Client, pa.Namespace, labelsSelector)

	if err != nil {
@@ -702,6 +715,16 @@ func (r *PodAutoscalerReconciler) updateMetricsForScale(ctx context.Context, pa
		return err
	}

+	// Append the ray head requirement to the label selector
+	if scale.GetAPIVersion() == orchestrationv1alpha1.GroupVersion.String() && scale.GetKind() == "RayClusterFleet" {
+		newRequirement, err := labels.NewRequirement("ray.io/node-type", selection.Equals, []string{"head"})
+		if err != nil {
+			klog.ErrorS(err, "Failed to add requirement ray.io/node-type=head to label selector")
+			return err
+		}
+		labelsSelector = labelsSelector.Add(*newRequirement)
+	}

	// Get pod list managed by scaleTargetRef
	podList, err := podutil.GetPodListByLabelSelector(ctx, r.Client, pa.Namespace, labelsSelector)
	if err != nil {
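The autoscaler change tightens the pod selector for `RayClusterFleet` targets so that only Ray head pods are counted when computing replicas. In Python terms, the selector tightening behaves roughly like this simplified, hypothetical helper (the real code builds a Kubernetes label requirement; here labels are plain dicts):

```python
def count_autoscaling_pods(pod_label_sets, selector, target_is_ray_fleet):
    """Count pods matching `selector`; for RayClusterFleet targets,
    additionally require ray.io/node-type=head, as in the diff."""
    required = dict(selector)
    if target_is_ray_fleet:
        # head pods represent one replica each; workers are scale-out
        # helpers of their head and must not inflate the count
        required["ray.io/node-type"] = "head"
    return sum(
        1
        for labels in pod_label_sets
        if all(labels.get(k) == v for k, v in required.items())
    )
```

Without the extra requirement, a fleet with one head and two workers would report three ready pods and the autoscaler would over-count replicas.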