Improve LoRA, autoscaling, and KV cache examples (#697)
* Fix issues in lora and autoscaling samples

* Change kv cache to pending status and schedule first

* Update kv cache examples with correct names

* Add missing service for kv cache

* Update image for distributed inference
Jeffwan authored Feb 18, 2025
1 parent a1b389f commit a2c89c9
Showing 15 changed files with 264 additions and 36 deletions.
2 changes: 1 addition & 1 deletion api/orchestration/v1alpha1/kvcache_types.go
@@ -95,7 +95,7 @@ type CacheSpec struct {

// the memory resources of kvcache container
// +kubebuilder:validation:Optional
// +kubebuilder:default:="2"
// +kubebuilder:default:="2Gi"
Memory string `json:"memory,omitempty"`

// the cpu resources of kvcache container
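The marker change above fixes the schema default for the kvcache container memory: a bare ``"2"`` would be read as 2 bytes if treated as a Kubernetes resource quantity, while ``"2Gi"`` is the intended default. A quick way to confirm the regenerated default on a cluster is sketched below; the CRD name is an assumption inferred from the Go package path, not taken from this commit.

.. code-block:: bash

   # Sketch only: the CRD name below is assumed from api/orchestration/v1alpha1;
   # adjust it to whatever `kubectl get crds | grep kvcache` reports on your cluster.
   kubectl get crd kvcaches.orchestration.aibrix.ai -o yaml | grep -A 3 'memory:'
   # After regenerating and reinstalling the CRDs, the memory property should show: default: 2Gi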
7 changes: 5 additions & 2 deletions docs/source/features/distributed-kv-cache.rst
@@ -30,10 +30,13 @@ After deployment, we can see all the components by using ``kubectl get pods -n a
.. code-block:: RST
NAME READY STATUS RESTARTS AGE
deepseek-coder-7b-kvcache-596965997-p86cx 1/1 Running 0 2m
deepseek-coder-7b-kvcache-596965997-p86cx 0/1 Pending 0 2m
deepseek-coder-7b-kvcache-etcd-0 1/1 Running 0 2m
After all components are running, we can use the following yaml to deploy the inference service:
.. note::
``deepseek-coder-7b-kvcache-596965997-p86cx`` is ``Pending`` because it is waiting for the inference engine to be deployed; this is expected.

After all components are created, we can use the following YAML to deploy the inference service:

.. literalinclude:: ../../../samples/kvcache/deployment.yaml
:language: yaml
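A minimal way to observe the behaviour described above is sketched below; only ``samples/kvcache/deployment.yaml`` is taken from the docs, and the namespace flag should match whatever the docs' ``kubectl get pods`` command uses.

.. code-block:: bash

   # Watch the kvcache components; the kvcache pod is expected to stay Pending for now
   kubectl get pods -w   # add the same -n <namespace> flag used in the docs above

   # Deploy the inference service referenced by the docs; the kvcache pod should then be scheduled
   kubectl apply -f samples/kvcache/deployment.yaml
   kubectl get pods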
25 changes: 20 additions & 5 deletions docs/source/features/lora-dynamic-loading.rst
@@ -113,10 +113,6 @@ Send request using lora model name to the gateway.

.. code-block:: bash
# Expose endpoint
LB_IP=$(kubectl get svc/envoy-aibrix-system-aibrix-eg-903790dc -n envoy-gateway-system -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
ENDPOINT="${LB_IP}:80"
# send request to base model
curl -v http://${ENDPOINT}/v1/completions \
-H "Content-Type: application/json" \
@@ -168,14 +164,33 @@ User may pass in the argument ``--api-key`` or environment variable ``VLLM_API_K

.. code-block:: bash
python3 -m vllm.entrypoints.openai.api_server --api-key test-key-1234567890
python3 -m vllm.entrypoints.openai.api_server --api-key sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake
We already provide an example; you can apply it with ``kubectl apply -f samples/adapter/adapter-with-key.yaml``.


In that case, the LoRA model adapter cannot query the vLLM server correctly and gets an ``{"error":"Unauthorized"}`` response. You need to update the ``additionalConfig`` field to pass in the API key.

.. literalinclude:: ../../../samples/adapter/adapter-api-key.yaml
:language: yaml


You need to send the request with the ``--header 'Authorization: Bearer your-api-key'`` option:

.. code-block:: bash
# send request to base model
curl -v http://${ENDPOINT}/v1/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake" \
-d '{
"model": "qwen-code-lora-with-key",
"prompt": "San Francisco is a",
"max_tokens": 128,
"temperature": 0
}'
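Since the docs above also mention the ``VLLM_API_KEY`` environment variable, one way to avoid hard-coding the key in the Deployment args is to keep it in a Secret and inject it as that variable. This is a sketch, not part of this commit; only the Deployment and container names come from ``samples/adapter/base-api-key.yaml`` below.

.. code-block:: bash

   # Store the key in a Secret (the key name is chosen so it becomes the env var vLLM reads)
   kubectl create secret generic vllm-api-key \
     --from-literal=VLLM_API_KEY=sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake

   # Import it into the vLLM container; you would then drop --api-key from the container args
   kubectl set env deployment/qwen-coder-1-5b-instruct -c vllm-openai --from=secret/vllm-api-key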
Runtime Support Sidecar
^^^^^^^^^^^^^^^^^^^^^^^

6 changes: 3 additions & 3 deletions samples/adapter/adapter-api-key.yaml
@@ -1,10 +1,10 @@
apiVersion: model.aibrix.ai/v1alpha1
kind: ModelAdapter
metadata:
name: qwen-code-lora
name: qwen-code-lora-with-key
namespace: default
labels:
model.aibrix.ai/name: "qwen-code-lora"
model.aibrix.ai/name: "qwen-code-lora-with-key"
model.aibrix.ai/port: "8000"
spec:
baseModel: qwen-coder-1-5b-instruct
@@ -13,5 +13,5 @@ spec:
model.aibrix.ai/name: qwen-coder-1-5b-instruct
artifactURL: huggingface://ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora
additionalConfig:
api-key: test-key-1234567890
api-key: sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake
schedulerName: default
103 changes: 103 additions & 0 deletions samples/adapter/base-api-key.yaml
@@ -0,0 +1,103 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
model.aibrix.ai/name: qwen-coder-1-5b-instruct # Note: The label value `model.aibrix.ai/name` here must match the Service name.
model.aibrix.ai/port: "8000"
adapter.model.aibrix.ai/enabled: "true"
name: qwen-coder-1-5b-instruct
namespace: default
spec:
replicas: 1
selector:
matchLabels:
model.aibrix.ai/name: qwen-coder-1-5b-instruct
template:
metadata:
labels:
model.aibrix.ai/name: qwen-coder-1-5b-instruct
spec:
containers:
- command:
- python3
- -m
- vllm.entrypoints.openai.api_server
- --host
- "0.0.0.0"
- --port
- "8000"
- --model
- Qwen/Qwen2.5-Coder-1.5B-Instruct
- --served-model-name
# Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
- qwen-coder-1-5b-instruct
- --api-key
- sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake
- --enable-lora
image: vllm/vllm-openai:v0.7.1
imagePullPolicy: Always
name: vllm-openai
env:
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
value: "True"
ports:
- containerPort: 8000
protocol: TCP
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
- name: aibrix-runtime
image: aibrix/runtime:v0.2.0
command:
- aibrix_runtime
- --port
- "8080"
env:
- name: INFERENCE_ENGINE
value: vllm
- name: INFERENCE_ENGINE_ENDPOINT
value: http://localhost:8000
ports:
- containerPort: 8080
protocol: TCP
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 3
periodSeconds: 2
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10

---

apiVersion: v1
kind: Service
metadata:
labels:
model.aibrix.ai/name: qwen-coder-1-5b-instruct
prometheus-discovery: "true"
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
name: qwen-coder-1-5b-instruct # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment
namespace: default
spec:
ports:
- name: serve
port: 8000
protocol: TCP
targetPort: 8000
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
model.aibrix.ai/name: qwen-coder-1-5b-instruct
type: ClusterIP
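A possible rollout order for the two API-key samples in this commit, sketched as shell commands (the ``kubectl wait`` condition and the plural resource name ``modeladapters`` are assumptions):

.. code-block:: bash

   # Deploy the base model that starts vLLM with --api-key, and wait for it to come up
   kubectl apply -f samples/adapter/base-api-key.yaml
   kubectl wait deployment/qwen-coder-1-5b-instruct --for=condition=Available --timeout=15m

   # Then register the LoRA adapter that carries the matching api-key in additionalConfig
   kubectl apply -f samples/adapter/adapter-api-key.yaml
   kubectl get modeladapters qwen-code-lora-with-key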
4 changes: 2 additions & 2 deletions samples/autoscaling/apa.yaml
@@ -1,7 +1,7 @@
apiVersion: autoscaling.aibrix.ai/v1alpha1
kind: PodAutoscaler
metadata:
name: aibrix-model-deepseek-llm-7b-chat-apa
name: deepseek-r1-distill-llama-8b-apa
namespace: default
labels:
app.kubernetes.io/name: aibrix
@@ -24,4 +24,4 @@ spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: aibrix-model-deepseek-llm-7b-chat
name: deepseek-r1-distill-llama-8b
48 changes: 38 additions & 10 deletions samples/autoscaling/deploy.yaml
@@ -2,15 +2,15 @@ apiVersion: apps/v1
kind: Deployment
metadata:
labels:
model.aibrix.ai/name: deepseek-llm-7b-chat
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
model.aibrix.ai/port: "8000"
name: aibrix-model-deepseek-llm-7b-chat
name: deepseek-r1-distill-llama-8b
namespace: default
spec:
replicas: 1
selector:
matchLabels:
model.aibrix.ai/name: deepseek-llm-7b-chat
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
strategy:
type: Recreate
template:
@@ -20,7 +20,7 @@ spec:
prometheus.io/port: "8000"
prometheus.io/path: "/metrics"
labels:
model.aibrix.ai/name: deepseek-llm-7b-chat
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
spec:
containers:
- command:
@@ -32,12 +32,13 @@ spec:
- --port
- "8000"
- --model
- deepseek-ai/deepseek-coder-6.7b-instruct
- deepseek-ai/DeepSeek-R1-Distill-Llama-8B
- --served-model-name
- deepseek-llm-7b-chat
- --trust-remote-code
- deepseek-r1-distill-llama-8b
- --dtype
- half
- --max-model-len
- "12288" # 24k length, this is to avoid "The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache" issue.
image: vllm/vllm-openai:v0.7.1
imagePullPolicy: IfNotPresent
livenessProbe:
@@ -46,7 +47,7 @@ spec:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 90
initialDelaySeconds: 120
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
@@ -55,12 +56,12 @@ spec:
- containerPort: 8000
protocol: TCP
readinessProbe:
failureThreshold: 3
failureThreshold: 5
httpGet:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 90
initialDelaySeconds: 120
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
@@ -69,3 +70,30 @@
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"

---

apiVersion: v1
kind: Service
metadata:
labels:
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
prometheus-discovery: "true"
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
name: deepseek-r1-distill-llama-8b # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment
namespace: default
spec:
ports:
- name: serve
port: 8000
protocol: TCP
targetPort: 8000
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
model.aibrix.ai/name: deepseek-r1-distill-llama-8b
type: ClusterIP
4 changes: 2 additions & 2 deletions samples/autoscaling/hpa.yaml
@@ -1,7 +1,7 @@
apiVersion: autoscaling.aibrix.ai/v1alpha1
kind: PodAutoscaler
metadata:
name: aibrix-model-deepseek-llm-7b-chat-hpa
name: deepseek-r1-distill-llama-8b-hpa
namespace: default
labels:
app.kubernetes.io/name: aibrix
@@ -20,4 +20,4 @@ spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: aibrix-model-deepseek-llm-7b-chat
name: deepseek-r1-distill-llama-8b
4 changes: 2 additions & 2 deletions samples/autoscaling/kpa.yaml
@@ -1,7 +1,7 @@
apiVersion: autoscaling.aibrix.ai/v1alpha1
kind: PodAutoscaler
metadata:
name: aibrix-model-deepseek-llm-7b-chat-kpa
name: deepseek-r1-distill-llama-8b-kpa
namespace: default
labels:
app.kubernetes.io/name: aibrix
@@ -22,4 +22,4 @@ spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: aibrix-model-deepseek-llm-7b-chat
name: deepseek-r1-distill-llama-8b
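The renamed autoscaling samples are meant to be used together: the Deployment and Service from ``deploy.yaml`` plus exactly one of the PodAutoscaler variants (``apa.yaml``, ``hpa.yaml``, or ``kpa.yaml``). A sketch of applying and observing them follows; the plural resource name ``podautoscalers`` is an assumption.

.. code-block:: bash

   # Deploy the model and its metrics Service, then attach one autoscaler variant
   kubectl apply -f samples/autoscaling/deploy.yaml
   kubectl apply -f samples/autoscaling/kpa.yaml   # or apa.yaml / hpa.yaml

   # Inspect the autoscaler and watch the target Deployment's replica count
   kubectl get podautoscalers -n default
   kubectl get deployment deepseek-r1-distill-llama-8b -w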
8 changes: 4 additions & 4 deletions samples/distributed/fleet-two-node.yaml
@@ -4,12 +4,12 @@ metadata:
labels:
app.kubernetes.io/name: aibrix
app.kubernetes.io/managed-by: kustomize
name: facebook-opt-13b
name: qwen-coder-7b-instruct
spec:
replicas: 1
selector:
matchLabels:
model.aibrix.ai/name: facebook-opt-13b
model.aibrix.ai/name: qwen-coder-7b-instruct
strategy:
rollingUpdate:
maxSurge: 25%
@@ -18,7 +18,7 @@ spec:
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
model.aibrix.ai/name: qwen-coder-7b-instruct
annotations:
ray.io/overwrite-container-cmd: "true"
spec:
@@ -41,7 +41,7 @@ spec:
- containerPort: 8000
name: service
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve Qwen/Qwen2.5-Coder-7B-Instruct --tensor-parallel-size 2 --distributed-executor-backend ray"]
resources:
limits:
cpu: "4"
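To try the updated distributed sample, a sketch follows; only the file path and the ``model.aibrix.ai/name`` label come from this commit, and the labeling of worker pods is an assumption.

.. code-block:: bash

   # Create the two-node fleet and watch the head and worker pods come up
   kubectl apply -f samples/distributed/fleet-two-node.yaml
   kubectl get pods -l model.aibrix.ai/name=qwen-coder-7b-instruct -w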
(Diffs for the remaining changed files are not shown.)
