Add benchmark scripts for gateway client-side changes (#340)
* Add gateway benchmark scripts

* Update the consistent traffic client

* Add model file
Jeffwan authored Oct 31, 2024
1 parent a1f3117 commit d5f8e8d
Showing 4 changed files with 609 additions and 0 deletions.
172 changes: 172 additions & 0 deletions benchmarks/gateway/7b.yaml
@@ -0,0 +1,172 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    model.aibrix.ai/name: deepseek-coder-7b-instruct
    model.aibrix.ai/port: "8000"
  name: deepseek-coder-7b-instruct
  namespace: aibrix-system
spec:
  replicas: 8
  selector:
    matchLabels:
      model.aibrix.ai/name: deepseek-coder-7b-instruct
  strategy:
    type: Recreate
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8000"
        prometheus.io/path: "/metrics"
      labels:
        model.aibrix.ai/name: deepseek-coder-7b-instruct
    spec:
      containers:
        - command:
            - python3
            - -m
            - vllm.entrypoints.openai.api_server
            - --host
            - "0.0.0.0"
            - --port
            - "8000"
            - --model
            - /models/deepseek-coder-6.7b-instruct
            - --served-model-name
            - deepseek-coder-7b-instruct
            - --trust-remote-code
            - --max-model-len
            - "10240"
            - --api-key
            - sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi
          image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
          imagePullPolicy: Always
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 90
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          name: vllm-openai
          ports:
            - containerPort: 8000
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 90
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          # We need to use dataset cache
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
            - name: dshm
              mountPath: /dev/shm
        - name: aibrix-runtime
          image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.1.0-rc.4
          command:
            - gunicorn
            - -b
            - :8080
            - app:app
            - -k
            - uvicorn.workers.UvicornWorker
          ports:
            - containerPort: 8080
              protocol: TCP
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
      initContainers:
        - name: init-model
          image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.1.0-rc.4
          command:
            - python
            - -m
            - aibrix.downloader
            - --model-uri
            - tos://aibrix-artifact-testing/models/deepseek-ai/deepseek-coder-6.7b-instruct/
            - --local-dir
            - /models/
          env:
            - name: DOWNLOADER_MODEL_NAME
              value: deepseek-coder-6.7b-instruct
            - name: DOWNLOADER_NUM_THREADS
              value: "16"
            - name: DOWNLOADER_ALLOW_FILE_SUFFIX
              value: json, safetensors
            - name: TOS_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  name: tos-credential
                  key: TOS_ACCESS_KEY
            - name: TOS_SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: tos-credential
                  key: TOS_SECRET_KEY
            - name: TOS_ENDPOINT
              value: tos-cn-beijing.ivolces.com
            - name: TOS_REGION
              value: cn-beijing
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
      volumes:
        - name: model-hostpath
          hostPath:
            path: /root/models
            type: DirectoryOrCreate
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: "4Gi"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: machine.cluster.vke.volcengine.com/gpu-name
                    operator: In
                    values:
                      - NVIDIA-A10
---
apiVersion: v1
kind: Service
metadata:
  labels:
    model.aibrix.ai/name: deepseek-coder-7b-instruct
    prometheus-discovery: "true"
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8000"
  name: deepseek-coder-7b-instruct
  namespace: aibrix-system
spec:
  ports:
    - name: serve
      port: 8000
      protocol: TCP
      targetPort: 8000
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
  selector:
    model.aibrix.ai/name: deepseek-coder-7b-instruct
  type: LoadBalancer
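
To roll out this manifest and watch the eight replicas come up (standard kubectl usage; paths as in this repo):

```bash
kubectl apply -f benchmarks/gateway/7b.yaml
kubectl -n aibrix-system get pods -l model.aibrix.ai/name=deepseek-coder-7b-instruct -w
```
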
135 changes: 135 additions & 0 deletions benchmarks/gateway/README.md
@@ -0,0 +1,135 @@
## Gateway Routing Benchmark

## Prerequisites

### Test Dataset

```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
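
As a quick sanity check, verify the download is the raw JSON file (a `/blob/` URL would have returned an HTML page) and count its conversations:

```bash
python3 -c "import json; print(len(json.load(open('ShareGPT_V3_unfiltered_cleaned_split.json'))), 'conversations')"
```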

### Client - Curl

```bash
curl -v http://localhost:8888/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-any-key" \
  -d '{
    "model": "deepseek-coder-7b-instruct",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "max_tokens": 128
  }'
```

### Client - Locust

```bash
locust -f benchmark.py --host http://localhost:8887
```
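
`benchmark.py` is added by this commit and is not reproduced here. As a rough sketch under assumptions (the `OUTPUT_FILE` and `ROUTING_STRATEGY` environment variables are taken from the invocations below, while the endpoint path, payload, and `routing-strategy` header are guesses), such a Locust file could look like:

```python
import json
import os
import time

from locust import HttpUser, between, task


class CompletionUser(HttpUser):
    wait_time = between(1, 2)

    @task
    def completion(self) -> None:
        headers = {"Authorization": "Bearer sk-any-key"}
        # Hypothetical: forward the chosen strategy to the gateway as a header.
        strategy = os.environ.get("ROUTING_STRATEGY")
        if strategy:
            headers["routing-strategy"] = strategy

        start = time.time()
        resp = self.client.post(
            "/v1/chat/completions",
            headers=headers,
            json={
                "model": "deepseek-coder-7b-instruct",
                "messages": [{"role": "user", "content": "Say this is a test!"}],
                "max_tokens": 128,
            },
        )

        # Append one JSON record per request for offline analysis.
        output_file = os.environ.get("OUTPUT_FILE")
        if output_file:
            with open(output_file, "a") as f:
                record = {"status": resp.status_code, "latency": time.time() - start}
                f.write(json.dumps(record) + "\n")
```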

## Experiments

Experiments 1 and 2 should use exactly the same client settings so that their results are comparable.

### Experiment 1: Gateway overhead (HTTPRoute) vs. k8s Service (baseline)

```bash
kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80
kubectl port-forward svc/deepseek-coder-7b-instruct 8887:8000 -n aibrix-system
```

> Note: port-forwarding cannot be used when testing more than one pod, because all traffic would go to a single pod.
> Change the model Service and the gateway Service to type `LoadBalancer` for real testing (see the patch sketch below).
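
A sketch of that switch, patching both Services by the names used in the port-forward commands above:

```bash
kubectl -n aibrix-system patch svc deepseek-coder-7b-instruct \
  -p '{"spec": {"type": "LoadBalancer"}}'
kubectl -n envoy-gateway-system patch svc envoy-aibrix-system-aibrix-eg-903790dc \
  -p '{"spec": {"type": "LoadBalancer"}}'
```
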
### Experiment 2: Three Routing Strategies

Wait until the cache is ready, then manually send a few requests to warm up the model, e.g. with the loop below.
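
A minimal warm-up loop, reusing the model name and gateway port from above:

```bash
for i in $(seq 1 5); do
  curl -s http://localhost:8888/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer sk-any-key" \
    -d '{"model": "deepseek-coder-7b-instruct", "messages": [{"role": "user", "content": "warm up"}], "max_tokens": 8}' > /dev/null
done
```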

> Note: the hosts below assume local port-forwarding; switch to an Elastic IP for real testing.

```bash
# service port-forwarding (k8s service baseline)
OUTPUT_FILE=k8s-service.jsonl locust -f benchmark.py --host http://localhost:8887 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_k8s_service.csv --csv-full-history --logfile benchmark_k8s_service.log

# gateway port-forwarding
OUTPUT_FILE=http-route.jsonl locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_httproute.csv --csv-full-history --logfile benchmark_gateway_httproute.log

OUTPUT_FILE=random.jsonl ROUTING_STRATEGY=random locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_random.csv --csv-full-history --logfile benchmark_gateway_random.log

OUTPUT_FILE=least-request.jsonl ROUTING_STRATEGY=least-request locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_least_request.csv --csv-full-history --logfile benchmark_gateway_least_request.log

OUTPUT_FILE=throughput.jsonl ROUTING_STRATEGY=throughput locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_throughput.csv --csv-full-history --logfile benchmark_gateway_throughput.log
```
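
`ROUTING_STRATEGY` is consumed by the benchmark client; to exercise a strategy outside Locust, it can be passed per request, assuming the gateway reads it from a `routing-strategy` header as in the Locust sketch above:

```bash
# One-off request pinned to a specific routing strategy (header name is an assumption).
curl http://localhost:8888/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-any-key" \
  -H "routing-strategy: least-request" \
  -d '{"model": "deepseek-coder-7b-instruct", "messages": [{"role": "user", "content": "Say this is a test!"}], "max_tokens": 128}'
```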

## Local Testing

```bash
make docker-build-plugins
# builds image: aibrix/plugins:9bd45a9915b71936ff0001a6fbfc32f10b65e480

# point the deployment at the new image
kubectl edit deployment aibrix-gateway-plugins

# the image tag (commit) is exactly the same, so delete the pod once to pick up the update
kubectl delete pod aibrix-gateway-plugins-759b87dc65-j9qs8
```
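
When the image tag does change between builds, `kubectl set image` updates the deployment and triggers a rollout in one step (the container name `gateway-plugins` inside the deployment is an assumption):

```bash
kubectl set image deployment/aibrix-gateway-plugins \
  gateway-plugins=aibrix/plugins:9bd45a9915b71936ff0001a6fbfc32f10b65e480
```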

```bash
curl http://localhost:8888/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer any_key" \
  -d '{
    "model": "llama2-70b",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

> Note: we no longer need to pass the model or the routing strategy in the request headers; this is cleaner and SDK-compatible.

## New Client Testing

```bash
python client.py \
--dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
--endpoint "http://101.126.24.162:8000" \
--num-prompts 2000 \
--interval 0.05 \
--output-file-path "k8s-v2.jsonl"
```

```bash
python client.py \
--dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
--endpoint "http://101.126.81.102:80" \
--num-prompts 2000 \
--interval 0.05 \
--output-file-path "httproute-v2.jsonl"
```

Update the routing-strategy environment variable before each of the remaining runs (the `client.py` invocation itself is unchanged):
```bash
python client.py \
--dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
--endpoint "http://101.126.81.102:80" \
--num-prompts 2000 \
--interval 0.05 \
--output-file-path "random-v2.jsonl"
```

```bash
python client.py \
--dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
--endpoint "http://101.126.81.102:80" \
--num-prompts 2000 \
--interval 0.05 \
--output-file-path "least-request-v2.jsonl"
```

```bash
python client.py \
--dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
--endpoint "http://101.126.81.102:80" \
--num-prompts 2000 \
--interval 0.05 \
--output-file-path "throughput-v2.jsonl"
```
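
To compare the five `*.jsonl` result files, a small summary script can compute latency percentiles. This is a sketch that assumes each record carries a numeric `latency` field, which may differ from what `client.py` actually writes:

```python
import json
import statistics
import sys


def summarize(path: str, field: str = "latency") -> None:
    """Print p50/p90/p99 of `field` across all records in a JSONL file."""
    values = []
    with open(path) as f:
        for line in f:
            record = json.loads(line)
            if field in record:
                values.append(float(record[field]))
    if not values:
        print(f"{path}: no '{field}' values found")
        return
    # statistics.quantiles with n=100 returns the 99 percentile cut points.
    q = statistics.quantiles(values, n=100)
    print(f"{path}: n={len(values)} p50={q[49]:.3f} p90={q[89]:.3f} p99={q[98]:.3f}")


if __name__ == "__main__":
    for path in sys.argv[1:]:
        summarize(path)
```

Run it as `python3 summarize.py k8s-v2.jsonl httproute-v2.jsonl random-v2.jsonl least-request-v2.jsonl throughput-v2.jsonl` (the script name is hypothetical).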