vllm-project · Jeffwan · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025
diff --git a/benchmarks/autoscaling/README.md b/benchmarks/autoscaling/README.md
@@ -7,7 +7,7 @@ You can run autoscaling benchmark experiment by simply running the command below
 
 
 What you have to check before running it
-- run deployment for your application (refer to `deepseek-llm-7b-chat-v100/deploy.yaml`)
+- run deployment for your application (refer to `deepseek-llm-7b-chat/deploy.yaml`)
 - change the name field under scaleTargetRef in all autoscaling yaml files.
 - check the deployment name in run-test.sh
 
@@ -17,7 +17,7 @@ For example,
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100 (*this one)
+    name: deepseek-llm-7b-chat (*this one)
 ...
 ```
 

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -0,0 +1,27 @@
+apiVersion: autoscaling.aibrix.ai/v1alpha1
+kind: PodAutoscaler
+metadata:
+  name: deepseek-llm-7b-chat-apa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  annotations:
+    autoscaling.aibrix.ai/up-fluctuation-tolerance: '0.1'
+    autoscaling.aibrix.ai/down-fluctuation-tolerance: '0.2'
+    apa.autoscaling.aibrix.ai/window: 30s
+spec:
+  scalingStrategy: APA
+  minReplicas: 1
+  maxReplicas: 8
+  metricsSources:
+    - metricSourceType: pod
+      protocolType: http
+      port: '8000'
+      path: metrics
+      targetMetric: gpu_cache_usage_perc
+      targetValue: '0.5'
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
diff --git a/...ing/deepseek-llm-7b-chat-v100/deploy.yaml → ...oscaling/deepseek-llm-7b-chat/deploy.yaml b/...ing/deepseek-llm-7b-chat-v100/deploy.yaml → ...oscaling/deepseek-llm-7b-chat/deploy.yaml
@@ -4,7 +4,7 @@ metadata:
   labels:
     model.aibrix.ai/name: deepseek-llm-7b-chat
     model.aibrix.ai/port: "8000"
-  name: deepseek-llm-7b-chat-v100
+  name: deepseek-llm-7b-chat
   namespace: default
 spec:
   replicas: 1
@@ -54,23 +54,23 @@ spec:
             successThreshold: 1
             timeoutSeconds: 1
           lifecycle:
-          preStop:
-            exec:
-              command:
-              - /bin/sh
-              - -c
-              - |
-                while true; do
-                  RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
-                  WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
-                  if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
-                    echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
-                    exit 0
-                  else
-                    echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
-                    sleep 5
-                  fi
-                done
+            preStop:
+              exec:
+                command:
+                - /bin/sh
+                - -c
+                - |
+                  while true; do
+                    RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
+                    WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
+                    if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
+                      echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                      exit 0
+                    else
+                      echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                      sleep 5
+                    fi
+                  done
           name: vllm-openai
           ports:
             - containerPort: 8000
@@ -127,7 +127,7 @@ spec:
             periodSeconds: 10
       initContainers:
         - name: init-model
-          image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc2
+          image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc.2
           command:
             - aibrix_download
             - --model-uri
@@ -158,7 +158,7 @@ spec:
           volumeMounts:
             - mountPath: /models
               name: model-hostpath
-      terminationGracePeriodSeconds: 300
+      terminationGracePeriodSeconds: 60
       volumes:
         - name: model-hostpath
           hostPath:

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -0,0 +1,23 @@
+apiVersion: autoscaling.aibrix.ai/v1alpha1
+kind: PodAutoscaler
+metadata:
+  name: deepseek-llm-7b-chat-hpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+spec:
+  scalingStrategy: HPA
+  minReplicas: 1
+  maxReplicas: 8
+  metricsSources:
+    - metricSourceType: pod
+      protocolType: http
+      port: '8000'
+      path: /metrics
+      targetMetric: gpu_cache_usage_perc
+      targetValue: '50'
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -0,0 +1,25 @@
+apiVersion: autoscaling.aibrix.ai/v1alpha1
+kind: PodAutoscaler
+metadata:
+  name: deepseek-llm-7b-chat-kpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  annotations:
+    kpa.autoscaling.aibrix.ai/scale-down-delay: 3m
+spec:
+  scalingStrategy: KPA
+  minReplicas: 1
+  maxReplicas: 8
+  metricsSources:
+    - metricSourceType: pod
+      protocolType: http
+      port: '8000'
+      path: metrics
+      targetMetric: gpu_cache_usage_perc
+      targetValue: '0.5'
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
diff --git a/...pseek-llm-7b-chat-v100/optimizer-kpa.yaml → ...g/deepseek-llm-7b-chat/optimizer-kpa.yaml b/...pseek-llm-7b-chat-v100/optimizer-kpa.yaml → ...g/deepseek-llm-7b-chat/optimizer-kpa.yaml
@@ -1,24 +1,25 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-gpu-optimizer
+  name: deepseek-llm-7b-chat-gpu-optimizer
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
+  annotations:
     kpa.autoscaling.aibrix.ai/scale-down-delay: 0s
 spec:
   scalingStrategy: KPA 
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
-    metricSourceType: domain
-    path: /metrics/default/deepseek-llm-7b-chat-v100
-    protocolType: http
-    targetMetric: vllm:deployment_replicas
-    targetValue: "1" 
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
+      metricSourceType: domain
+      path: /metrics/default/deepseek-llm-7b-chat
+      protocolType: http
+      targetMetric: vllm:deployment_replicas
+      targetValue: "1" 
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100
+    name: deepseek-llm-7b-chat
diff --git a/...caling/deepseek-llm-7b-chat-v100/svc.yaml → ...autoscaling/deepseek-llm-7b-chat/svc.yaml b/...caling/deepseek-llm-7b-chat-v100/svc.yaml → ...autoscaling/deepseek-llm-7b-chat/svc.yaml
diff --git a/benchmarks/autoscaling/overnight_run.sh b/benchmarks/autoscaling/overnight_run.sh
@@ -7,8 +7,7 @@ if [ -z "${workload_path}" ]; then
     exit 1
 fi
 
-# autoscalers="hpa kpa apa optimizer-kpa"
-autoscalers="apa optimizer-kpa"
+autoscalers="hpa kpa apa optimizer-kpa"
 for autoscaler in ${autoscalers}; do
     start_time=$(date +%s)
     echo "--------------------------------"

diff --git a/benchmarks/autoscaling/run-test.sh b/benchmarks/autoscaling/run-test.sh
@@ -4,8 +4,8 @@ input_workload_path=$1
 autoscaler=$2
 aibrix_repo="/Users/bytedance/projects/aibrix-2" # root dir of aibrix repo
 api_key="sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi" # set your api key
-k8s_yaml_dir="deepseek-llm-7b-chat-v100"
-target_deployment="deepseek-llm-7b-chat-v100" # "aibrix-model-deepseek-llm-7b-chat"
+k8s_yaml_dir="deepseek-llm-7b-chat"
+target_deployment="deepseek-llm-7b-chat" # "aibrix-model-deepseek-llm-7b-chat"
 target_ai_model=deepseek-llm-7b-chat
 
 echo "Make sure ${target_deployment} is the right deployment."
@@ -60,8 +60,10 @@ echo "started port-forwarding with PID: $PORT_FORWARD_PID"
 # Clean up any existing autoscalers
 kubectl delete podautoscaler --all --all-namespaces
 kubectl delete hpa --all --all-namespaces
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml
 
 # Apply new autoscaler
+kubectl apply -f ${k8s_yaml_dir}/deploy.yaml
 kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml
 echo "kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml"
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
@@ -122,6 +124,7 @@ sleep 1
 # Cleanup
 kubectl delete podautoscaler --all --all-namespaces
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml
 
 # Stop monitoring processes
 echo "Stopping monitoring processes..."

diff --git a/benchmarks/autoscaling/streaming_pod_log_to_file.py b/benchmarks/autoscaling/streaming_pod_log_to_file.py
@@ -11,20 +11,22 @@ def get_all_pods(namespace):
     pod_list = pod_list_output.decode('utf-8').split()
     return pod_list
 
-def write_logs(keyword, fname, process):
+def write_logs(keywords, fname, process):
     with open(fname, 'w') as log_file:
         while True:
             line = process.stdout.readline()
             if not line:
                 break
-            if keyword is None:
-                # If there is no keyword, write all logs
-                log_file.write(line)
-                log_file.flush()
-            if keyword and keyword in line:
-                # If there is keyword, write only the lines containing the keyword
+            if len(keywords) == 0: # If there is no keyword, write all logs
                 log_file.write(line)
                 log_file.flush()
+            else:
+                for keyword in keywords:
+                    if keyword in line:
+                        # If there is keyword, write only the lines containing the keyword
+                        log_file.write(line)
+                        log_file.flush()
+                        break
 
 def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
     if not os.path.exists(pod_log_dir):
@@ -38,12 +40,11 @@ def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
         stderr=subprocess.PIPE,
         universal_newlines=True
     )
-    # if namespace == "default":
-    #     keyword = "Avg prompt throughput:"
-    # else:
-    #     keyword = None
-    keyword = None # you can specify keyword here to filter logs
-    log_thread = threading.Thread(target=write_logs, args=(keyword, fname, process))
+    if namespace == "default":
+        keywords = ["Avg prompt throughput:", "logger.py", "engine.py"]
+    else:
+        keywords = []
+    log_thread = threading.Thread(target=write_logs, args=(keywords, fname, process))
     log_thread.start()
     return process, log_thread