Add benchmark scripts for gateway client side changes (#340)
* Add gateway benchmark scripts
* Update the consistent traffic client
* Add model file
Showing 4 changed files with 609 additions and 0 deletions.
New file (172 lines): Kubernetes Deployment and Service manifest for the `deepseek-coder-7b-instruct` model (vLLM server, AIBrix runtime sidecar, and a model-download init container).

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    model.aibrix.ai/name: deepseek-coder-7b-instruct
    model.aibrix.ai/port: "8000"
  name: deepseek-coder-7b-instruct
  namespace: aibrix-system
spec:
  replicas: 8
  selector:
    matchLabels:
      model.aibrix.ai/name: deepseek-coder-7b-instruct
  strategy:
    type: Recreate
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8000"
        prometheus.io/path: "/metrics"
      labels:
        model.aibrix.ai/name: deepseek-coder-7b-instruct
    spec:
      containers:
      - command:
        - python3
        - -m
        - vllm.entrypoints.openai.api_server
        - --host
        - "0.0.0.0"
        - --port
        - "8000"
        - --model
        - /models/deepseek-coder-6.7b-instruct
        - --served-model-name
        - deepseek-coder-7b-instruct
        - --trust-remote-code
        - --max-model-len
        - "10240"
        - --api-key
        - sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi
        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
        imagePullPolicy: Always
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 90
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        name: vllm-openai
        ports:
        - containerPort: 8000
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 90
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          limits:
            nvidia.com/gpu: "1"
          requests:
            nvidia.com/gpu: "1"
        # We need to use dataset cache
        volumeMounts:
        - mountPath: /models
          name: model-hostpath
        - name: dshm
          mountPath: /dev/shm
      - name: aibrix-runtime
        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.1.0-rc.4
        command:
        - gunicorn
        - -b
        - :8080
        - app:app
        - -k
        - uvicorn.workers.UvicornWorker
        ports:
        - containerPort: 8080
          protocol: TCP
        volumeMounts:
        - mountPath: /models
          name: model-hostpath
      initContainers:
      - name: init-model
        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.1.0-rc.4
        command:
        - python
        - -m
        - aibrix.downloader
        - --model-uri
        - tos://aibrix-artifact-testing/models/deepseek-ai/deepseek-coder-6.7b-instruct/
        - --local-dir
        - /models/
        env:
        - name: DOWNLOADER_MODEL_NAME
          value: deepseek-coder-6.7b-instruct
        - name: DOWNLOADER_NUM_THREADS
          value: "16"
        - name: DOWNLOADER_ALLOW_FILE_SUFFIX
          value: json, safetensors
        - name: TOS_ACCESS_KEY
          valueFrom:
            secretKeyRef:
              name: tos-credential
              key: TOS_ACCESS_KEY
        - name: TOS_SECRET_KEY
          valueFrom:
            secretKeyRef:
              name: tos-credential
              key: TOS_SECRET_KEY
        - name: TOS_ENDPOINT
          value: tos-cn-beijing.ivolces.com
        - name: TOS_REGION
          value: cn-beijing
        volumeMounts:
        - mountPath: /models
          name: model-hostpath
      volumes:
      - name: model-hostpath
        hostPath:
          path: /root/models
          type: DirectoryOrCreate
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: "4Gi"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: machine.cluster.vke.volcengine.com/gpu-name
                operator: In
                values:
                - NVIDIA-A10
---
apiVersion: v1
kind: Service
metadata:
  labels:
    model.aibrix.ai/name: deepseek-coder-7b-instruct
    prometheus-discovery: "true"
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8000"
  name: deepseek-coder-7b-instruct
  namespace: aibrix-system
spec:
  ports:
  - name: serve
    port: 8000
    protocol: TCP
    targetPort: 8000
  - name: http
    port: 8080
    protocol: TCP
    targetPort: 8080
  selector:
    model.aibrix.ai/name: deepseek-coder-7b-instruct
  type: LoadBalancer
```
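
To deploy, apply the manifest and wait for the eight replicas to become ready; the filename below is illustrative, not from the commit:

```bash
kubectl apply -f deepseek-coder-7b-instruct.yaml
kubectl -n aibrix-system rollout status deployment/deepseek-coder-7b-instruct
kubectl -n aibrix-system get svc deepseek-coder-7b-instruct   # note the LoadBalancer external IP
```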

---

New file (135 lines): README for the gateway routing benchmark.

## Gateway Routing Benchmark

## Prerequisites

### Test Dataset

```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
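
A quick structural sanity check after downloading (note the `/resolve/` URL above; the `/blob/` variant returns an HTML page rather than the raw JSON). This is a minimal sketch assuming the standard ShareGPT layout — a JSON array of records, each with a `conversations` list:

```python
import json

# Peek at the ShareGPT dump: each record holds a "conversations" list
# of {"from": ..., "value": ...} turns.
with open("ShareGPT_V3_unfiltered_cleaned_split.json") as f:
    data = json.load(f)

print(f"{len(data)} conversations")
print(data[0]["conversations"][0])  # first turn of the first conversation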

### Client - Curl

Since the payload uses `messages`, the request must go to the chat completions endpoint:

```bash
curl -v http://localhost:8888/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-any-key" \
  -d '{
     "model": "deepseek-coder-7b-instruct",
     "messages": [{"role": "user", "content": "Say this is a test!"}],
     "max_tokens": 128
  }'
```

### Client - Locust

```bash
locust -f benchmark.py --host http://localhost:8887
```
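
`benchmark.py` is the locustfile added in this commit. For orientation, a minimal locustfile in the same spirit is sketched below; the payload, the `OUTPUT_FILE`/`ROUTING_STRATEGY` handling, and the `routing-strategy` header name are assumptions, not the actual script:

```python
import json
import os

from locust import HttpUser, between, task

OUTPUT_FILE = os.environ.get("OUTPUT_FILE", "results.jsonl")
ROUTING_STRATEGY = os.environ.get("ROUTING_STRATEGY")  # random / least-request / throughput


class CompletionUser(HttpUser):
    wait_time = between(1, 2)

    @task
    def completion(self):
        headers = {"Authorization": "Bearer sk-any-key"}
        if ROUTING_STRATEGY:
            # assumed header name for selecting a routing strategy
            headers["routing-strategy"] = ROUTING_STRATEGY
        resp = self.client.post(
            "/v1/chat/completions",
            headers=headers,
            json={
                "model": "deepseek-coder-7b-instruct",
                "messages": [{"role": "user", "content": "Say this is a test!"}],
                "max_tokens": 128,
            },
        )
        # One JSON line per request for offline analysis.
        with open(OUTPUT_FILE, "a") as f:
            f.write(json.dumps({
                "status": resp.status_code,
                "latency_s": resp.elapsed.total_seconds(),
            }) + "\n")
```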

## Experiments

Experiments 1 and 2 must use exactly the same client settings so that their results are directly comparable.

### Experiment 1: gateway overhead (httpRoute) vs k8s service (baseline)

```bash
kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80
kubectl port-forward svc/deepseek-coder-7b-instruct 8887:8000 -n aibrix-system
```
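
Before starting the load, it is worth confirming that both forwards answer. vLLM's OpenAI-compatible server exposes `GET /v1/models`; the key for the direct path is the one baked into the deployment manifest above, and the gateway check assumes the gateway forwards `/v1/models` as well:

```bash
# direct k8s service (baseline path)
curl -s http://localhost:8887/v1/models \
  -H "Authorization: Bearer sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi"
# through the gateway (httpRoute path)
curl -s http://localhost:8888/v1/models -H "Authorization: Bearer sk-any-key"
```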

> Note: we cannot use port-forward when testing with more than one pod, because all the traffic would land on a single pod.
> Change the model service and the gateway service to `LoadBalancer` for real testing.
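
For example, as a one-off patch (service names as in the manifest above; adjust to your cluster):

```bash
kubectl -n aibrix-system patch svc deepseek-coder-7b-instruct \
  -p '{"spec": {"type": "LoadBalancer"}}'
kubectl -n envoy-gateway-system patch svc envoy-aibrix-system-aibrix-eg-903790dc \
  -p '{"spec": {"type": "LoadBalancer"}}'
```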

### Experiment 2: Three Routing Strategies

Wait until the cache is ready, then manually send a few requests to warm up the model.

```bash
# Note: this setup is for local testing; switch to an Elastic IP later if needed.

# k8s service port-forwarding (baseline)
OUTPUT_FILE=k8s-service.jsonl locust -f benchmark.py --host http://localhost:8887 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_k8s_service.csv --csv-full-history --logfile benchmark_k8s_service.log

# gateway port-forwarding
OUTPUT_FILE=http-route.jsonl locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_httproute.csv --csv-full-history --logfile benchmark_gateway_httproute.log

OUTPUT_FILE=random.jsonl ROUTING_STRATEGY=random locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_random.csv --csv-full-history --logfile benchmark_gateway_random.log

OUTPUT_FILE=least-request.jsonl ROUTING_STRATEGY=least-request locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_least_request.csv --csv-full-history --logfile benchmark_gateway_least_request.log

OUTPUT_FILE=throughput.jsonl ROUTING_STRATEGY=throughput locust -f benchmark.py --host http://localhost:8888 --headless --users 30 --spawn-rate 0.08 --run-time 10m --csv benchmark_gateway_throughput.csv --csv-full-history --logfile benchmark_gateway_throughput.log
```
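
Locust treats the `--csv` value as a prefix, so each run above leaves a `<prefix>_stats.csv` with an `Aggregated` summary row. A small sketch to line the runs up side by side (column names follow Locust's standard stats CSV):

```python
import csv

# One stats file per run above; "--csv X" yields "X_stats.csv".
PREFIXES = [
    "benchmark_k8s_service.csv",
    "benchmark_gateway_httproute.csv",
    "benchmark_gateway_random.csv",
    "benchmark_gateway_least_request.csv",
    "benchmark_gateway_throughput.csv",
]

for prefix in PREFIXES:
    with open(f"{prefix}_stats.csv") as f:
        for row in csv.DictReader(f):
            if row["Name"] == "Aggregated":
                print(f'{prefix}: median={row["Median Response Time"]} ms, '
                      f'rps={row["Requests/s"]}')
```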

## Local Testing

```bash
make docker-build-plugins
# resulting image: aibrix/plugins:9bd45a9915b71936ff0001a6fbfc32f10b65e480

# point the deployment at the new image tag
kubectl edit deployment aibrix-gateway-plugins

# the tag is the same commit hash, so after updating the deployment once,
# recreating the pod is enough to pick up the new image
kubectl delete pod aibrix-gateway-plugins-759b87dc65-j9qs8
```
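
Equivalently, a rollout restart recreates the pods without having to look up the pod name:

```bash
kubectl rollout restart deployment aibrix-gateway-plugins
kubectl rollout status deployment aibrix-gateway-plugins
```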

```bash
curl http://localhost:8888/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer any_key" \
  -d '{
     "model": "llama2-70b",
     "messages": [{"role": "user", "content": "Say this is a test!"}],
     "temperature": 0.7
  }'
```

> Note: we no longer need to pass the model or the routing strategy in the headers; this keeps requests clean and SDK-compatible.

## New Client Testing

`client.py` is the consistent-traffic client updated in this commit: with `--interval 0.05` it issues a request roughly every 50 ms (~20 requests/s). First against the k8s service directly:

```bash
python client.py \
    --dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
    --endpoint "http://101.126.24.162:8000" \
    --num-prompts 2000 \
    --interval 0.05 \
    --output-file-path "k8s-v2.jsonl"
```
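
As a reference for what constant-rate sending looks like, here is a minimal fixed-interval async sender. The flag names match the commands above; the endpoint path, payload, API key, and recorded fields are assumptions, not the actual `client.py`:

```python
import argparse
import asyncio
import json
import time

import aiohttp


async def send(session, endpoint, prompt, out):
    start = time.perf_counter()
    async with session.post(
        f"{endpoint}/v1/chat/completions",
        headers={"Authorization": "Bearer sk-any-key"},  # assumed key
        json={"model": "deepseek-coder-7b-instruct",
              "messages": [{"role": "user", "content": prompt}],
              "max_tokens": 128},
    ) as resp:
        await resp.read()
        out.write(json.dumps({"status": resp.status,
                              "latency_s": time.perf_counter() - start}) + "\n")


async def main(args):
    with open(args.dataset_path) as f:
        data = json.load(f)
    prompts = [c["conversations"][0]["value"] for c in data if c.get("conversations")]

    async with aiohttp.ClientSession() as session:
        with open(args.output_file_path, "a") as out:
            tasks = []
            for prompt in prompts[: args.num_prompts]:
                # Constant-rate dispatch: a new request every --interval seconds,
                # regardless of how long earlier responses take.
                tasks.append(asyncio.create_task(send(session, args.endpoint, prompt, out)))
                await asyncio.sleep(args.interval)
            await asyncio.gather(*tasks)


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--dataset-path", required=True)
    p.add_argument("--endpoint", required=True)
    p.add_argument("--num-prompts", type=int, default=2000)
    p.add_argument("--interval", type=float, default=0.05)
    p.add_argument("--output-file-path", default="results.jsonl")
    asyncio.run(main(p.parse_args()))
```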

Through the gateway (httpRoute):

```bash
python client.py \
    --dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
    --endpoint "http://101.126.81.102:80" \
    --num-prompts 2000 \
    --interval 0.05 \
    --output-file-path "httproute-v2.jsonl"
```

Update the env between the remaining runs (presumably the routing strategy, as with the Locust commands above). Random routing:

```bash
python client.py \
    --dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
    --endpoint "http://101.126.81.102:80" \
    --num-prompts 2000 \
    --interval 0.05 \
    --output-file-path "random-v2.jsonl"
```

Least-request routing:

```bash
python client.py \
    --dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
    --endpoint "http://101.126.81.102:80" \
    --num-prompts 2000 \
    --interval 0.05 \
    --output-file-path "least-request-v2.jsonl"
```

Throughput routing:

```bash
python client.py \
    --dataset-path "/tmp/ShareGPT_V3_unfiltered_cleaned_split.json" \
    --endpoint "http://101.126.81.102:80" \
    --num-prompts 2000 \
    --interval 0.05 \
    --output-file-path "throughput-v2.jsonl"
```
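
Each run above leaves a jsonl file; summarizing them into latency percentiles makes the routing strategies directly comparable. A small sketch, assuming each line carries a latency field as in the client sketch earlier:

```python
import json
import statistics

# Field name "latency_s" is hypothetical; match whatever client.py records.
FILES = ["k8s-v2.jsonl", "httproute-v2.jsonl", "random-v2.jsonl",
         "least-request-v2.jsonl", "throughput-v2.jsonl"]

for path in FILES:
    with open(path) as f:
        latencies = sorted(json.loads(line)["latency_s"] for line in f)
    p50 = statistics.median(latencies)
    p99 = latencies[int(0.99 * (len(latencies) - 1))]
    print(f"{path}: n={len(latencies)} p50={p50:.3f}s p99={p99:.3f}s")
```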