Skip to content

Commit

Permalink
update script
Browse files Browse the repository at this point in the history
  • Loading branch information
KuntaiDu committed Jul 25, 2024
1 parent 2dd0b89 commit 43787b1
Show file tree
Hide file tree
Showing 2 changed files with 223 additions and 4 deletions.
10 changes: 6 additions & 4 deletions .buildkite/nightly-benchmarks/nightly-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ common_container_settings: &common_container_settings
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."

- label: "A100 vllm benchmark"
- label: "A100 sglang benchmark"
priority: 100
agents:
queue: A100
Expand All @@ -47,10 +47,12 @@ steps:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:latest
- image: lmsysorg/sglang:latest
<<: *common_container_settings

- label: "A100 sglang benchmark"
- wait

- label: "A100 vllm benchmark"
priority: 100
agents:
queue: A100
Expand All @@ -59,7 +61,7 @@ steps:
podSpec:
<<: *common_pod_spec
containers:
- image: lmsysorg/sglang:latest
- image: vllm/vllm-openai:latest
<<: *common_container_settings

- wait
Expand Down
217 changes: 217 additions & 0 deletions .buildkite/nightly-benchmarks/scripts/run-sgl-nightly.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
#!/bin/bash

set -o pipefail

check_gpus() {
  # Detect how many GPUs nvidia-smi reports and which model they are.
  # Globals written: gpu_count, gpu_type
  # Exits the script with status 1 when no GPU is listed.
  declare -g gpu_count
  gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # The per-GPU name lines are deliberately flattened onto one line (unquoted
  # expansion) and the second whitespace-separated word is taken as the type.
  local gpu_names
  gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader)
  declare -g gpu_type
  gpu_type=$(echo $gpu_names | awk '{print $2}')
  echo "GPU type is $gpu_type"
}

kill_gpu_processes() {
  # Signal every process named pt_main_thread (presumably the serving
  # engine's worker processes -- confirm), then give them time to exit.
  pkill pt_main_thread
  sleep 10

  # Report memory in use on GPU 0 so the log shows whether the kill
  # actually released the device; the expected value is 0 MB.
  gpu_memory_usage=$(
    nvidia-smi \
      --query-gpu=memory.used \
      --format=csv,noheader,nounits \
      -i 0
  )
  printf 'GPU 0 Memory Usage: %s MB\n' "$gpu_memory_usage"
}

json2args() {
  # Convert a flat JSON object into a command-line argument string.
  # Every key becomes a "--flag" with '_' replaced by '-', followed by its
  # value rendered as a string; pairs are joined with single spaces.
  #
  # example:
  #   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  #   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  #
  # Arguments: $1 - JSON object as a string
  # Outputs:   the argument string on stdout
  # Returns:   0 on success; jq's non-zero status when $1 is not a JSON
  #            object jq can convert (nothing is printed in that case)
  local json_string=$1

  # Declaration is kept separate from the assignment so that jq's exit
  # status is not masked by `local`.
  local args
  args=$(
    jq -r '
      to_entries
      | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring))
      | join(" ")
    ' <<<"$json_string"
  ) || return
  echo "$args"
}

wait_for_server() {
  # Block until the local completions endpoint answers, polling once per
  # second for at most 1200 seconds.
  #
  # Arguments: $1 - (optional) port the server listens on; defaults to 8000
  # Returns:   0 once curl gets a response, 1 if the timeout expires first
  local port=${1:-8000}

  # The port is handed to the inner shell as a positional parameter rather
  # than interpolated into the script text.
  if timeout 1200 bash -c '
    until curl -s "localhost:$1/v1/completions" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi
  return 1
}

run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases

local serving_test_file
serving_test_file=$1

# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')

# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi

# append vllm to the test name
test_name=sgl_$test_name


# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

# get client and server arguments
server_params=$(echo "$params" | jq -r '.sgl_server_parameters')
client_params=$(echo "$params" | jq -r '.sgl_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"

# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi

if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m sglang.launch_server \
--tp $tp \
--model-path $model \
--port $port \
$server_args"
fi

# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &

# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "sgl server is up and running."
else
echo ""
echo "sgl failed to start within the timeout period."
break
fi

# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi

new_test_name=$test_name"_qps_"$qps

client_command="python3 benchmark_serving.py \
--backend sglang \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"

echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"

eval "$client_command"

# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "sgl" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

done

# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}


upload_to_buildkite() {
  # Publish everything under $RESULTS_FOLDER as Buildkite artifacts.
  # When the agent binary is absent the upload is skipped and the function
  # still returns 0.
  local agent=/workspace/buildkite-agent

  if [[ -f "$agent" ]]; then
    # Annotation of the build with the markdown summary is currently disabled:
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    "$agent" artifact upload "$RESULTS_FOLDER/*"
    return
  fi

  echo "buildkite-agent binary not found. Skip uploading the results."
  return 0
}

main() {
  # Entry point: verify GPUs, run the sglang nightly serving benchmarks from
  # the vllm benchmarks directory, summarize the results and upload them.
  # Globals read:    VLLM_SOURCE_CODE_LOC (root of the vllm checkout)
  # Globals written: RESULTS_FOLDER, BENCHMARK_ROOT,
  #                  CURRENT_LLM_SERVING_ENGINE (exported)

  check_gpus

  # enter vllm directory; every later path is relative to it, so a failed
  # cd must stop the run instead of benchmarking from the wrong place
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || {
    echo "Cannot enter $VLLM_SOURCE_CODE_LOC/benchmarks; aborting." >&2
    exit 1
  }
  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  export CURRENT_LLM_SERVING_ENGINE=sgl
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # the summary script needs tabulate and pandas
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite

}

main "$@"

0 comments on commit 43787b1

Please sign in to comment.