From 54abfff2e063467cc4a81d73fa720c49d7de5a77 Mon Sep 17 00:00:00 2001
From: Gangmuk
Date: Wed, 12 Feb 2025 17:22:05 -0800
Subject: [PATCH 1/3] fixed manifest names to drop the v100 suffix, recorded
 per-request logs in pod log streaming, and updated the script to apply
 deploy.yaml instead of reusing the existing deployment

---
 benchmarks/autoscaling/README.md              |  4 +-
 .../apa.yaml                                  |  2 +-
 .../deploy.yaml                               | 40 +++++++++----------
 .../hpa.yaml                                  |  4 +-
 .../kpa.yaml                                  |  4 +-
 .../optimizer-kpa.yaml                        |  6 +--
 .../svc.yaml                                  |  0
 benchmarks/autoscaling/overnight_run.sh       |  3 +-
 benchmarks/autoscaling/run-test.sh            |  7 +++-
 .../autoscaling/streaming_pod_log_to_file.py  | 27 +++++++------
 10 files changed, 50 insertions(+), 47 deletions(-)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/apa.yaml (95%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/deploy.yaml (83%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/hpa.yaml (84%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/kpa.yaml (86%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/optimizer-kpa.yaml (79%)
 rename benchmarks/autoscaling/{deepseek-llm-7b-chat-v100 => deepseek-llm-7b-chat}/svc.yaml (100%)

diff --git a/benchmarks/autoscaling/README.md b/benchmarks/autoscaling/README.md
index d71bb093..827ea337 100644
--- a/benchmarks/autoscaling/README.md
+++ b/benchmarks/autoscaling/README.md
@@ -7,7 +7,7 @@
 You can run the autoscaling benchmark experiment by simply running the command below
 
 What you have to check before running it
-- run deployment for your application (refer to `deepseek-llm-7b-chat-v100/deploy.yaml`)
+- run deployment for your application (refer to `deepseek-llm-7b-chat/deploy.yaml`)
 - change the name field under scaleTargetRef in all autoscaling yaml files.
 - check the deployment name in run-test.sh
 
@@ -17,7 +17,7 @@ For example,
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100 (*this one)
+    name: deepseek-llm-7b-chat (*this one)
   ...
 ```

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
similarity index 95%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
index 40fa8918..44f02f56 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -23,4 +23,4 @@ spec:
     scaleTargetRef:
         apiVersion: apps/v1
         kind: Deployment
-        name: deepseek-llm-7b-chat-v100
\ No newline at end of file
+        name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
similarity index 83%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
index b4b107cd..c503357e 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
@@ -4,7 +4,7 @@ metadata:
   labels:
     model.aibrix.ai/name: deepseek-llm-7b-chat
     model.aibrix.ai/port: "8000"
-  name: deepseek-llm-7b-chat-v100
+  name: deepseek-llm-7b-chat
   namespace: default
 spec:
   replicas: 1
@@ -54,23 +54,23 @@ spec:
           successThreshold: 1
           timeoutSeconds: 1
         lifecycle:
-        preStop:
-          exec:
-            command:
-            - /bin/sh
-            - -c
-            - |
-              while true; do
-                RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
-                WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
-                if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
-                  echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
-                  exit 0
-                else
-                  echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
-                  sleep 5
-                fi
-              done
+          preStop:
+            exec:
+              command:
+              - /bin/sh
+              - -c
+              - |
+                while true; do
+                  RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
+                  WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
+                  if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
+                    echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                    exit 0
+                  else
+                    echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                    sleep 5
+                  fi
+                done
         name: vllm-openai
         ports:
        - containerPort: 8000
@@ -127,7 +127,7 @@ spec:
           periodSeconds: 10
       initContainers:
       - name: init-model
-        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc2
+        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc.2
         command:
         - aibrix_download
         - --model-uri
@@ -158,7 +158,7 @@ spec:
         volumeMounts:
         - mountPath: /models
           name: model-hostpath
-      terminationGracePeriodSeconds: 300
+      terminationGracePeriodSeconds: 60
       volumes:
       - name: model-hostpath
         hostPath:
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
similarity index 84%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 97e83fb5..247b7841 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -1,7 +1,7 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-v100-hpa
+    name: podautoscaler-deepseek-llm-7b-chat-hpa
     namespace: default
     labels:
         app.kubernetes.io/name: aibrix
@@ -20,4 +20,4 @@ spec:
     scaleTargetRef:
         apiVersion: apps/v1
         kind: Deployment
-        name: deepseek-llm-7b-chat-v100
\ No newline at end of file
+        name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
similarity index 86%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index a2bf4ee8..2cc74820 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -1,7 +1,7 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-v100-kpa
+    name: podautoscaler-deepseek-llm-7b-chat-kpa
     namespace: default
     labels:
         app.kubernetes.io/name: aibrix
@@ -21,4 +21,4 @@ spec:
     scaleTargetRef:
         apiVersion: apps/v1
         kind: Deployment
-        name: deepseek-llm-7b-chat-v100
\ No newline at end of file
+        name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
similarity index 79%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
index 2a4cd76a..80f55525 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
@@ -1,7 +1,7 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-gpu-optimizer
+  name: podautoscaler-deepseek-llm-7b-chat-gpu-optimizer
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
@@ -14,11 +14,11 @@ spec:
   metricsSources:
     - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
      metricSourceType: domain
-      path: /metrics/default/deepseek-llm-7b-chat-v100
+      path: /metrics/default/deepseek-llm-7b-chat
       protocolType: http
       targetMetric: vllm:deployment_replicas
       targetValue: "1"
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100
+    name: deepseek-llm-7b-chat
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/svc.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/svc.yaml
similarity index 100%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/svc.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/svc.yaml
diff --git a/benchmarks/autoscaling/overnight_run.sh b/benchmarks/autoscaling/overnight_run.sh
index b553f25e..3870dfaf 100755
--- a/benchmarks/autoscaling/overnight_run.sh
+++ b/benchmarks/autoscaling/overnight_run.sh
@@ -7,8 +7,7 @@ if [ -z "${workload_path}" ]; then
     exit 1
 fi
 
-# autoscalers="hpa kpa apa optimizer-kpa"
-autoscalers="apa optimizer-kpa"
+autoscalers="hpa kpa apa optimizer-kpa"
 for autoscaler in ${autoscalers}; do
     start_time=$(date +%s)
     echo "--------------------------------"
diff --git a/benchmarks/autoscaling/run-test.sh b/benchmarks/autoscaling/run-test.sh
index 64e3ffb7..229ca3cc 100755
--- a/benchmarks/autoscaling/run-test.sh
+++ b/benchmarks/autoscaling/run-test.sh
@@ -4,8 +4,8 @@ input_workload_path=$1
 autoscaler=$2
 aibrix_repo="/Users/bytedance/projects/aibrix-2" # root dir of aibrix repo
 api_key="sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi" # set your api key
-k8s_yaml_dir="deepseek-llm-7b-chat-v100"
-target_deployment="deepseek-llm-7b-chat-v100" # "aibrix-model-deepseek-llm-7b-chat"
+k8s_yaml_dir="deepseek-llm-7b-chat"
+target_deployment="deepseek-llm-7b-chat" # "aibrix-model-deepseek-llm-7b-chat"
 target_ai_model=deepseek-llm-7b-chat
 
 echo "Make sure ${target_deployment} is the right deployment."
@@ -60,8 +60,10 @@ echo "started port-forwarding with PID: $PORT_FORWARD_PID"
 # Clean up any existing autoscalers
 kubectl delete podautoscaler --all --all-namespaces
 kubectl delete hpa --all --all-namespaces
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml
 
 # Apply new autoscaler
+kubectl apply -f ${k8s_yaml_dir}/deploy.yaml
 kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml
 echo "kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml"
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
@@ -122,6 +124,7 @@ sleep 1
 # Cleanup
 kubectl delete podautoscaler --all --all-namespaces
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml
 
 # Stop monitoring processes
 echo "Stopping monitoring processes..."
diff --git a/benchmarks/autoscaling/streaming_pod_log_to_file.py b/benchmarks/autoscaling/streaming_pod_log_to_file.py
index fd48b0e9..47b098ec 100644
--- a/benchmarks/autoscaling/streaming_pod_log_to_file.py
+++ b/benchmarks/autoscaling/streaming_pod_log_to_file.py
@@ -11,20 +11,22 @@ def get_all_pods(namespace):
     pod_list = pod_list_output.decode('utf-8').split()
     return pod_list
 
-def write_logs(keyword, fname, process):
+def write_logs(keywords, fname, process):
     with open(fname, 'w') as log_file:
         while True:
             line = process.stdout.readline()
             if not line:
                 break
-            if keyword is None:
-                # If there is no keyword, write all logs
-                log_file.write(line)
-                log_file.flush()
-            if keyword and keyword in line:
-                # If there is keyword, write only the lines containing the keyword
+            if len(keywords) == 0: # If there are no keywords, write all logs
                 log_file.write(line)
                 log_file.flush()
+            else:
+                for keyword in keywords:
+                    if keyword in line:
+                        # Otherwise, write only the lines containing a keyword
+                        log_file.write(line)
+                        log_file.flush()
+                        break
 
 def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
     if not os.path.exists(pod_log_dir):
@@ -38,12 +40,11 @@ def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
         stderr=subprocess.PIPE,
         universal_newlines=True
     )
-    # if namespace == "default":
-    #     keyword = "Avg prompt throughput:"
-    # else:
-    #     keyword = None
-    keyword = None # you can specify keyword here to filter logs
-    log_thread = threading.Thread(target=write_logs, args=(keyword, fname, process))
+    if namespace == "default":
+        keywords = ["Avg prompt throughput:", "logger.py", "engine.py"]
+    else:
+        keywords = []
+    log_thread = threading.Thread(target=write_logs, args=(keywords, fname, process))
     log_thread.start()
     return process, log_thread
 

From 8f16b38a4c0783b831267031bb602efe69bd3739 Mon Sep 17 00:00:00 2001
From: Gangmuk
Date: Wed, 12 Feb 2025 23:24:40 -0800
Subject: [PATCH 2/3] fixed the indentation (two spaces)

---
 .../autoscaling/deepseek-llm-7b-chat/apa.yaml | 44 +++++++++----------
 .../autoscaling/deepseek-llm-7b-chat/hpa.yaml | 38 ++++++++--------
 .../autoscaling/deepseek-llm-7b-chat/kpa.yaml | 40 ++++++++---------
 3 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
index 44f02f56..d644cc54 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -1,26 +1,26 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-v100-apa
-    namespace: default
-    labels:
-        app.kubernetes.io/name: aibrix
-        app.kubernetes.io/managed-by: kustomize
-        autoscaling.aibrix.ai/up-fluctuation-tolerance: "0.1"
-        autoscaling.aibrix.ai/down-fluctuation-tolerance: "0.2"
-        apa.autoscaling.aibrix.ai/window: "30s"
+  name: podautoscaler-deepseek-llm-7b-chat-v100-apa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+    autoscaling.aibrix.ai/up-fluctuation-tolerance: "0.1"
+    autoscaling.aibrix.ai/down-fluctuation-tolerance: "0.2"
+    apa.autoscaling.aibrix.ai/window: "30s"
 spec:
-    scalingStrategy: "APA"
-    minReplicas: 1
-    maxReplicas: 10
-    metricsSources:
-        - metricSourceType: "pod"
-          protocolType: "http"
-          port: "8000"
-          path: "metrics"
-          targetMetric: "gpu_cache_usage_perc"
-          targetValue: "0.5"
-    scaleTargetRef:
-        apiVersion: apps/v1
-        kind: Deployment
-        name: deepseek-llm-7b-chat
\ No newline at end of file
+  scalingStrategy: "APA"
+  minReplicas: 1
+  maxReplicas: 10
+  metricsSources:
+  - metricSourceType: "pod"
+    protocolType: "http"
+    port: "8000"
+    path: "metrics"
+    targetMetric: "gpu_cache_usage_perc"
+    targetValue: "0.5"
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 247b7841..0e644a32 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -1,23 +1,23 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-hpa
-    namespace: default
-    labels:
-        app.kubernetes.io/name: aibrix
-        app.kubernetes.io/managed-by: kustomize
+  name: podautoscaler-deepseek-llm-7b-chat-hpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
 spec:
-    scalingStrategy: "HPA"
-    minReplicas: 1
-    maxReplicas: 10
-    metricsSources:
-        - metricSourceType: "pod"
-          protocolType: "http"
-          port: "8000"
-          path: "/metrics"
-          targetMetric: "gpu_cache_usage_perc"
-          targetValue: "50"
-    scaleTargetRef:
-        apiVersion: apps/v1
-        kind: Deployment
-        name: deepseek-llm-7b-chat
\ No newline at end of file
+  scalingStrategy: "HPA"
+  minReplicas: 1
+  maxReplicas: 10
+  metricsSources:
+  - metricSourceType: "pod"
+    protocolType: "http"
+    port: "8000"
+    path: "/metrics"
+    targetMetric: "gpu_cache_usage_perc"
+    targetValue: "50"
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index 2cc74820..c5e7359c 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -1,24 +1,24 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-    name: podautoscaler-deepseek-llm-7b-chat-kpa
-    namespace: default
-    labels:
-        app.kubernetes.io/name: aibrix
-        app.kubernetes.io/managed-by: kustomize
-        kpa.autoscaling.aibrix.ai/scale-down-delay: "3m"
+  name: podautoscaler-deepseek-llm-7b-chat-kpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+    kpa.autoscaling.aibrix.ai/scale-down-delay: "3m"
 spec:
-    scalingStrategy: "KPA"
-    minReplicas: 1
-    maxReplicas: 10
-    metricsSources:
-        - metricSourceType: "pod"
-          protocolType: "http"
-          port: "8000"
-          path: "metrics"
-          targetMetric: "gpu_cache_usage_perc"
-          targetValue: "0.5"
-    scaleTargetRef:
-        apiVersion: apps/v1
-        kind: Deployment
-        name: deepseek-llm-7b-chat
\ No newline at end of file
+  scalingStrategy: "KPA"
+  minReplicas: 1
+  maxReplicas: 10
+  metricsSources:
+  - metricSourceType: "pod"
+    protocolType: "http"
+    port: "8000"
+    path: "metrics"
+    targetMetric: "gpu_cache_usage_perc"
+    targetValue: "0.5"
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file

From b6455354bf3849f6075f59d7d7362b59fc3bda31 Mon Sep 17 00:00:00 2001
From: Gangmuk
Date: Thu, 13 Feb 2025 10:01:55 -0800
Subject: [PATCH 3/3] Reformat autoscaler yamls based on #665

---
 .../autoscaling/deepseek-llm-7b-chat/apa.yaml | 25 ++++++++++---------
 .../autoscaling/deepseek-llm-7b-chat/hpa.yaml | 18 ++++++-------
 .../autoscaling/deepseek-llm-7b-chat/kpa.yaml | 21 ++++++++--------
 .../deepseek-llm-7b-chat/optimizer-kpa.yaml   | 17 +++++++------
 4 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
index d644cc54..a81d1381 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -1,25 +1,26 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-apa
+  name: deepseek-llm-7b-chat-v100-apa
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
-    autoscaling.aibrix.ai/up-fluctuation-tolerance: "0.1"
-    autoscaling.aibrix.ai/down-fluctuation-tolerance: "0.2"
-    apa.autoscaling.aibrix.ai/window: "30s"
+  annotations:
+    autoscaling.aibrix.ai/up-fluctuation-tolerance: '0.1'
+    autoscaling.aibrix.ai/down-fluctuation-tolerance: '0.2'
+    apa.autoscaling.aibrix.ai/window: 30s
 spec:
-  scalingStrategy: "APA"
+  scalingStrategy: APA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - metricSourceType: "pod"
-    protocolType: "http"
-    port: "8000"
-    path: "metrics"
-    targetMetric: "gpu_cache_usage_perc"
-    targetValue: "0.5"
+  - metricSourceType: pod
+    protocolType: http
+    port: '8000'
+    path: metrics
+    targetMetric: gpu_cache_usage_perc
+    targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 0e644a32..55adfc00 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -1,22 +1,22 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-hpa
+  name: deepseek-llm-7b-chat-hpa
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
 spec:
-  scalingStrategy: "HPA"
+  scalingStrategy: HPA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - metricSourceType: "pod"
-    protocolType: "http"
-    port: "8000"
-    path: "/metrics"
-    targetMetric: "gpu_cache_usage_perc"
-    targetValue: "50"
+  - metricSourceType: pod
+    protocolType: http
+    port: '8000'
+    path: /metrics
+    targetMetric: gpu_cache_usage_perc
+    targetValue: '50'
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index c5e7359c..c49d4546 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -1,23 +1,24 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-kpa
+  name: deepseek-llm-7b-chat-kpa
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
-    kpa.autoscaling.aibrix.ai/scale-down-delay: "3m"
+  annotations:
+    kpa.autoscaling.aibrix.ai/scale-down-delay: 3m
 spec:
-  scalingStrategy: "KPA"
+  scalingStrategy: KPA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - metricSourceType: "pod"
-    protocolType: "http"
-    port: "8000"
-    path: "metrics"
-    targetMetric: "gpu_cache_usage_perc"
-    targetValue: "0.5"
+  - metricSourceType: pod
+    protocolType: http
+    port: '8000'
+    path: metrics
+    targetMetric: gpu_cache_usage_perc
+    targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
index 80f55525..e26bd7d8 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
@@ -1,23 +1,24 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-gpu-optimizer
+  name: deepseek-llm-7b-chat-gpu-optimizer
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
+  annotations:
     kpa.autoscaling.aibrix.ai/scale-down-delay: 0s
 spec:
   scalingStrategy: KPA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
-      metricSourceType: domain
-      path: /metrics/default/deepseek-llm-7b-chat
-      protocolType: http
-      targetMetric: vllm:deployment_replicas
-      targetValue: "1"
+  - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
+    metricSourceType: domain
+    path: /metrics/default/deepseek-llm-7b-chat
+    protocolType: http
+    targetMetric: vllm:deployment_replicas
+    targetValue: "1"
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
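
Note: the keyword filtering that patch 1 introduces in `write_logs` can be sanity-checked without a cluster. The sketch below is a minimal approximation of that filtering logic, assuming any line-oriented iterable in place of the `kubectl logs -f` subprocess stdout; `filter_lines`, `fake_stream`, and the output path are illustrative stand-ins, not part of the patch.

```python
import io

def filter_lines(keywords, fname, stream):
    # Mirrors the patched write_logs: with an empty keyword list every line
    # is persisted; otherwise only lines containing at least one keyword are.
    with open(fname, 'w') as log_file:
        for line in stream:
            if not keywords or any(keyword in line for keyword in keywords):
                log_file.write(line)
                log_file.flush()

# Hypothetical stand-in for the streamed pod log.
fake_stream = io.StringIO(
    "INFO logger.py:42 request finished\n"
    "DEBUG unrelated noise\n"
    "Avg prompt throughput: 123.4 tokens/s\n"
)

# With the default-namespace keyword list from the patch, the first and
# third lines are kept and the second is dropped.
filter_lines(["Avg prompt throughput:", "logger.py", "engine.py"],
             "/tmp/pod.log", fake_stream)
```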