From 54abfff2e063467cc4a81d73fa720c49d7de5a77 Mon Sep 17 00:00:00 2001
From: Gangmuk
Date: Wed, 12 Feb 2025 17:22:05 -0800
Subject: [PATCH 1/3] fixed manifest names to drop the v100 suffix, recorded
 per-request logs in pod log streaming, and updated the script to apply
 deploy.yaml instead of reusing the existing deployment

---
 benchmarks/autoscaling/README.md              |  4 +-
 .../apa.yaml                                  |  2 +-
 .../deploy.yaml                               | 40 +++++++++----------
 .../hpa.yaml                                  |  4 +-
 .../kpa.yaml                                  |  4 +-
 .../optimizer-kpa.yaml                        |  6 +--
 .../svc.yaml                                  |  0
 benchmarks/autoscaling/overnight_run.sh       |  3 +-
 benchmarks/autoscaling/run-test.sh            |  7 +++-
 .../autoscaling/streaming_pod_log_to_file.py  | 27 +++++++------
 10 files changed, 50 insertions(+), 47 deletions(-)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/apa.yaml (95%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/deploy.yaml (83%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/hpa.yaml (84%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/kpa.yaml (86%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/optimizer-kpa.yaml (79%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/svc.yaml (100%)

diff --git a/benchmarks/autoscaling/README.md b/benchmarks/autoscaling/README.md
index d71bb093..827ea337 100644
--- a/benchmarks/autoscaling/README.md
+++ b/benchmarks/autoscaling/README.md
@@ -7,7 +7,7 @@
 You can run the autoscaling benchmark experiment by simply running the command below
 
 What you have to check before running it
-- run deployment for your application (refer to `deepseek-llm-7b-chat-v100/deploy.yaml`)
+- run deployment for your application (refer to `deepseek-llm-7b-chat/deploy.yaml`)
 - change the name field under scaleTargetRef in all autoscaling yaml files.
 - check the deployment name in run-test.sh
 
@@ -17,7 +17,7 @@ For example,
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100 (*this one)
+    name: deepseek-llm-7b-chat (*this one)
   ...
 ```

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
similarity index 95%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
index 40fa8918..44f02f56 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -23,4 +23,4 @@ spec:
     scaleTargetRef:
         apiVersion: apps/v1
         kind: Deployment
-        name: deepseek-llm-7b-chat-v100
\ No newline at end of file
+        name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
similarity index 83%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
index b4b107cd..c503357e 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
@@ -4,7 +4,7 @@ metadata:
   labels:
     model.aibrix.ai/name: deepseek-llm-7b-chat
     model.aibrix.ai/port: "8000"
-  name: deepseek-llm-7b-chat-v100
+  name: deepseek-llm-7b-chat
   namespace: default
 spec:
   replicas: 1
@@ -54,23 +54,23 @@ spec:
           successThreshold: 1
           timeoutSeconds: 1
         lifecycle:
-        preStop:
-          exec:
-            command:
-            - /bin/sh
-            - -c
-            - |
-              while true; do
-                RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
-                WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
-                if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
-                  echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
-                  exit 0
-                else
-                  echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
-                  sleep 5
-                fi
-              done
+          preStop:
+            exec:
+              command:
+              - /bin/sh
+              - -c
+              - |
+                while true; do
+                  RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
+                  WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
+                  if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
+                    echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                    exit 0
+                  else
+                    echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                    sleep 5
+                  fi
+                done
         name: vllm-openai
         ports:
        - containerPort: 8000
@@ -127,7 +127,7 @@ spec:
           periodSeconds: 10
       initContainers:
       - name: init-model
-        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc2
+        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc.2
         command:
         - aibrix_download
         - --model-uri
@@ -158,7 +158,7 @@ spec:
         volumeMounts:
         - mountPath: /models
           name: model-hostpath
-      terminationGracePeriodSeconds: 300
+      terminationGracePeriodSeconds: 60
       volumes:
       - name: model-hostpath
         hostPath:
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
similarity index 84%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 97e83fb5..247b7841 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -1,7 +1,7 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-v100-hpa
+    name: podautoscaler-deepseek-llm-7b-chat-hpa
     namespace: default
     labels:
         app.kubernetes.io/name: aibrix
@@ -20,4 +20,4 @@ spec:
     scaleTargetRef:
         apiVersion: apps/v1
         kind: Deployment
-        name: deepseek-llm-7b-chat-v100
\ No newline at end of file
+        name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
similarity index 86%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index a2bf4ee8..2cc74820 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -1,7 +1,7 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-v100-kpa
+    name: podautoscaler-deepseek-llm-7b-chat-kpa
     namespace: default
     labels:
         app.kubernetes.io/name: aibrix
@@ -21,4 +21,4 @@ spec:
     scaleTargetRef:
         apiVersion: apps/v1
         kind: Deployment
-        name: deepseek-llm-7b-chat-v100
\ No newline at end of file
+        name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
similarity index 79%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
index 2a4cd76a..80f55525 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
@@ -1,7 +1,7 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-gpu-optimizer
+  name: podautoscaler-deepseek-llm-7b-chat-gpu-optimizer
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
@@ -14,11 +14,11 @@ spec:
   metricsSources:
     - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      metricSourceType: domain
-      path: /metrics/default/deepseek-llm-7b-chat-v100
+      path: /metrics/default/deepseek-llm-7b-chat
       protocolType: http
       targetMetric: vllm:deployment_replicas
       targetValue: "1"
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100
+    name: deepseek-llm-7b-chat
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/svc.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/svc.yaml
similarity index 100%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/svc.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/svc.yaml
diff --git a/benchmarks/autoscaling/overnight_run.sh b/benchmarks/autoscaling/overnight_run.sh
index b553f25e..3870dfaf 100755
--- a/benchmarks/autoscaling/overnight_run.sh
+++ b/benchmarks/autoscaling/overnight_run.sh
@@ -7,8 +7,7 @@ if [ -z "${workload_path}" ]; then
     exit 1
 fi
 
-# autoscalers="hpa kpa apa optimizer-kpa"
-autoscalers="apa optimizer-kpa"
+autoscalers="hpa kpa apa optimizer-kpa"
 for autoscaler in ${autoscalers}; do
     start_time=$(date +%s)
     echo "--------------------------------"
diff --git a/benchmarks/autoscaling/run-test.sh b/benchmarks/autoscaling/run-test.sh
index 64e3ffb7..229ca3cc 100755
--- a/benchmarks/autoscaling/run-test.sh
+++ b/benchmarks/autoscaling/run-test.sh
@@ -4,8 +4,8 @@ input_workload_path=$1
 autoscaler=$2
 aibrix_repo="/Users/bytedance/projects/aibrix-2" # root dir of aibrix repo
 api_key="sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi" # set your api key
-k8s_yaml_dir="deepseek-llm-7b-chat-v100"
-target_deployment="deepseek-llm-7b-chat-v100" # "aibrix-model-deepseek-llm-7b-chat"
+k8s_yaml_dir="deepseek-llm-7b-chat"
+target_deployment="deepseek-llm-7b-chat" # "aibrix-model-deepseek-llm-7b-chat"
 target_ai_model=deepseek-llm-7b-chat
 
 echo "Make sure ${target_deployment} is the right deployment."
@@ -60,8 +60,10 @@ echo "started port-forwarding with PID: $PORT_FORWARD_PID"
 # Clean up any existing autoscalers
 kubectl delete podautoscaler --all --all-namespaces
 kubectl delete hpa --all --all-namespaces
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml
 
 # Apply new autoscaler
+kubectl apply -f ${k8s_yaml_dir}/deploy.yaml
 kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml
 echo "kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml"
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
@@ -122,6 +124,7 @@ sleep 1
 # Cleanup
 kubectl delete podautoscaler --all --all-namespaces
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml
 
 # Stop monitoring processes
 echo "Stopping monitoring processes..."
diff --git a/benchmarks/autoscaling/streaming_pod_log_to_file.py b/benchmarks/autoscaling/streaming_pod_log_to_file.py
index fd48b0e9..47b098ec 100644
--- a/benchmarks/autoscaling/streaming_pod_log_to_file.py
+++ b/benchmarks/autoscaling/streaming_pod_log_to_file.py
@@ -11,20 +11,22 @@ def get_all_pods(namespace):
     pod_list = pod_list_output.decode('utf-8').split()
     return pod_list
 
-def write_logs(keyword, fname, process):
+def write_logs(keywords, fname, process):
     with open(fname, 'w') as log_file:
         while True:
             line = process.stdout.readline()
             if not line:
                 break
-            if keyword is None:
-                # If there is no keyword, write all logs
-                log_file.write(line)
-                log_file.flush()
-            if keyword and keyword in line:
-                # If there is keyword, write only the lines containing the keyword
+            if len(keywords) == 0: # If there are no keywords, write all logs
                 log_file.write(line)
                 log_file.flush()
+            else:
+                for keyword in keywords:
+                    if keyword in line:
+                        # Otherwise, write only the lines containing a keyword
+                        log_file.write(line)
+                        log_file.flush()
+                        break
 
 def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
     if not os.path.exists(pod_log_dir):
@@ -38,12 +40,11 @@ def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
         stderr=subprocess.PIPE,
         universal_newlines=True
     )
-    # if namespace == "default":
-    #     keyword = "Avg prompt throughput:"
-    # else:
-    #     keyword = None
-    keyword = None # you can specify keyword here to filter logs
-    log_thread = threading.Thread(target=write_logs, args=(keyword, fname, process))
+    if namespace == "default":
+        keywords = ["Avg prompt throughput:", "logger.py", "engine.py"]
+    else:
+        keywords = []
+    log_thread = threading.Thread(target=write_logs, args=(keywords, fname, process))
     log_thread.start()
     return process, log_thread
 

From 8f16b38a4c0783b831267031bb602efe69bd3739 Mon Sep 17 00:00:00 2001
From: Gangmuk
Date: Wed, 12 Feb 2025 23:24:40 -0800
Subject: [PATCH 2/3] fixed the indentation (two spaces)

---
 .../autoscaling/deepseek-llm-7b-chat/apa.yaml | 44 +++++++++----------
 .../autoscaling/deepseek-llm-7b-chat/hpa.yaml | 38 ++++++++--------
 .../autoscaling/deepseek-llm-7b-chat/kpa.yaml | 40 ++++++++---------
 3 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
index 44f02f56..d644cc54 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -1,26 +1,26 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-v100-apa
-    namespace: default
-    labels:
-        app.kubernetes.io/name: aibrix
-        app.kubernetes.io/managed-by: kustomize
-        autoscaling.aibrix.ai/up-fluctuation-tolerance: "0.1"
-        autoscaling.aibrix.ai/down-fluctuation-tolerance: "0.2"
-        apa.autoscaling.aibrix.ai/window: "30s"
+  name: podautoscaler-deepseek-llm-7b-chat-v100-apa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+    autoscaling.aibrix.ai/up-fluctuation-tolerance: "0.1"
+    autoscaling.aibrix.ai/down-fluctuation-tolerance: "0.2"
+    apa.autoscaling.aibrix.ai/window: "30s"
 spec:
-    scalingStrategy: "APA"
-    minReplicas: 1
-    maxReplicas: 10
-    metricsSources:
-        - metricSourceType: "pod"
-          protocolType: "http"
-          port: "8000"
-          path: "metrics"
-          targetMetric: "gpu_cache_usage_perc"
-          targetValue: "0.5"
-    scaleTargetRef:
-        apiVersion: apps/v1
-        kind: Deployment
-        name: deepseek-llm-7b-chat
\ No newline at end of file
+  scalingStrategy: "APA"
+  minReplicas: 1
+  maxReplicas: 10
+  metricsSources:
+  - metricSourceType: "pod"
+    protocolType: "http"
+    port: "8000"
+    path: "metrics"
+    targetMetric: "gpu_cache_usage_perc"
+    targetValue: "0.5"
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 247b7841..0e644a32 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -1,23 +1,23 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-hpa
-    namespace: default
-    labels:
-        app.kubernetes.io/name: aibrix
-        app.kubernetes.io/managed-by: kustomize
+  name: podautoscaler-deepseek-llm-7b-chat-hpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
 spec:
-    scalingStrategy: "HPA"
-    minReplicas: 1
-    maxReplicas: 10
-    metricsSources:
-        - metricSourceType: "pod"
-          protocolType: "http"
-          port: "8000"
-          path: "/metrics"
-          targetMetric: "gpu_cache_usage_perc"
-          targetValue: "50"
-    scaleTargetRef:
-        apiVersion: apps/v1
-        kind: Deployment
-        name: deepseek-llm-7b-chat
\ No newline at end of file
+  scalingStrategy: "HPA"
+  minReplicas: 1
+  maxReplicas: 10
+  metricsSources:
+  - metricSourceType: "pod"
+    protocolType: "http"
+    port: "8000"
+    path: "/metrics"
+    targetMetric: "gpu_cache_usage_perc"
+    targetValue: "50"
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index 2cc74820..c5e7359c 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -1,24 +1,24 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-kpa
-    namespace: default
-    labels:
-        app.kubernetes.io/name: aibrix
-        app.kubernetes.io/managed-by: kustomize
-        kpa.autoscaling.aibrix.ai/scale-down-delay: "3m"
+  name: podautoscaler-deepseek-llm-7b-chat-kpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+    kpa.autoscaling.aibrix.ai/scale-down-delay: "3m"
 spec:
-    scalingStrategy: "KPA"
-    minReplicas: 1
-    maxReplicas: 10
-    metricsSources:
-        - metricSourceType: "pod"
-          protocolType: "http"
-          port: "8000"
-          path: "metrics"
-          targetMetric: "gpu_cache_usage_perc"
-          targetValue: "0.5"
-    scaleTargetRef:
-        apiVersion: apps/v1
-        kind: Deployment
-        name: deepseek-llm-7b-chat
\ No newline at end of file
+  scalingStrategy: "KPA"
+  minReplicas: 1
+  maxReplicas: 10
+  metricsSources:
+  - metricSourceType: "pod"
+    protocolType: "http"
+    port: "8000"
+    path: "metrics"
+    targetMetric: "gpu_cache_usage_perc"
+    targetValue: "0.5"
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file

From b6455354bf3849f6075f59d7d7362b59fc3bda31 Mon Sep 17 00:00:00 2001
From: Gangmuk
Date: Thu, 13 Feb 2025 10:01:55 -0800
Subject: [PATCH 3/3] Reformat autoscaler yamls based on #665

---
 .../autoscaling/deepseek-llm-7b-chat/apa.yaml | 25 ++++++++++---------
 .../autoscaling/deepseek-llm-7b-chat/hpa.yaml | 18 ++++++-------
 .../autoscaling/deepseek-llm-7b-chat/kpa.yaml | 21 ++++++++--------
 .../deepseek-llm-7b-chat/optimizer-kpa.yaml   | 17 +++++++------
 4 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
index d644cc54..a81d1381 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -1,25 +1,26 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-apa
+  name: deepseek-llm-7b-chat-v100-apa
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
-    autoscaling.aibrix.ai/up-fluctuation-tolerance: "0.1"
-    autoscaling.aibrix.ai/down-fluctuation-tolerance: "0.2"
-    apa.autoscaling.aibrix.ai/window: "30s"
+  annotations:
+    autoscaling.aibrix.ai/up-fluctuation-tolerance: '0.1'
+    autoscaling.aibrix.ai/down-fluctuation-tolerance: '0.2'
+    apa.autoscaling.aibrix.ai/window: 30s
 spec:
-  scalingStrategy: "APA"
+  scalingStrategy: APA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - metricSourceType: "pod"
-    protocolType: "http"
-    port: "8000"
-    path: "metrics"
-    targetMetric: "gpu_cache_usage_perc"
-    targetValue: "0.5"
+  - metricSourceType: pod
+    protocolType: http
+    port: '8000'
+    path: metrics
+    targetMetric: gpu_cache_usage_perc
+    targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 0e644a32..55adfc00 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -1,22 +1,22 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-hpa
+  name: deepseek-llm-7b-chat-hpa
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
 spec:
-  scalingStrategy: "HPA"
+  scalingStrategy: HPA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - metricSourceType: "pod"
-    protocolType: "http"
-    port: "8000"
-    path: "/metrics"
-    targetMetric: "gpu_cache_usage_perc"
-    targetValue: "50"
+  - metricSourceType: pod
+    protocolType: http
+    port: '8000'
+    path: /metrics
+    targetMetric: gpu_cache_usage_perc
+    targetValue: '50'
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index c5e7359c..c49d4546 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -1,23 +1,24 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-kpa
+  name: deepseek-llm-7b-chat-kpa
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
-    kpa.autoscaling.aibrix.ai/scale-down-delay: "3m"
+  annotations:
+    kpa.autoscaling.aibrix.ai/scale-down-delay: 3m
 spec:
-  scalingStrategy: "KPA"
+  scalingStrategy: KPA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - metricSourceType: "pod"
-    protocolType: "http"
-    port: "8000"
-    path: "metrics"
-    targetMetric: "gpu_cache_usage_perc"
-    targetValue: "0.5"
+  - metricSourceType: pod
+    protocolType: http
+    port: '8000'
+    path: metrics
+    targetMetric: gpu_cache_usage_perc
+    targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
index 80f55525..e26bd7d8 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
@@ -1,23 +1,24 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-gpu-optimizer
+  name: deepseek-llm-7b-chat-gpu-optimizer
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
+  annotations:
     kpa.autoscaling.aibrix.ai/scale-down-delay: 0s
 spec:
   scalingStrategy: KPA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
-      metricSourceType: domain
-      path: /metrics/default/deepseek-llm-7b-chat
-      protocolType: http
-      targetMetric: vllm:deployment_replicas
-      targetValue: "1"
+  - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    metricSourceType: domain
+    path: /metrics/default/deepseek-llm-7b-chat
+    protocolType: http
+    targetMetric: vllm:deployment_replicas
+    targetValue: "1"
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
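
Note: the keyword filtering that patch 1 introduces in `write_logs` can be sanity-checked without a cluster. The sketch below is a minimal approximation of that filtering logic, assuming any line-oriented iterable in place of the `kubectl logs -f` subprocess stdout; `filter_lines`, `fake_stream`, and the output path are illustrative stand-ins, not part of the patch.

```python
import io

def filter_lines(keywords, fname, stream):
    # Mirrors the patched write_logs: with an empty keyword list every line
    # is persisted; otherwise only lines containing at least one keyword are.
    with open(fname, 'w') as log_file:
        for line in stream:
            if not keywords or any(keyword in line for keyword in keywords):
                log_file.write(line)
                log_file.flush()

# Hypothetical stand-in for the streamed pod log.
fake_stream = io.StringIO(
    "INFO logger.py:42 request finished\n"
    "DEBUG unrelated noise\n"
    "Avg prompt throughput: 123.4 tokens/s\n"
)

# With the default-namespace keyword list from the patch, the first and
# third lines are kept and the second is dropped.
filter_lines(["Avg prompt throughput:", "logger.py", "engine.py"],
             "/tmp/pod.log", fake_stream)
```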