diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 3f38cf5137535..b8db594852b64 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -301,6 +301,107 @@ run_serving_tests() {
   kill_gpu_processes
 }
 
+run_genai_perf_tests() {
+  # run genai-perf tests
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there are enough GPUs to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU(s) found. Skip test case $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # map 'inf' (quoted by @sh) to num_prompts to approximate an unbounded request rate
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      # TODO: add output dir.
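+      # Build the genai-perf client command. The request rate and prompt count
+      # come from the test JSON; the target is the engine's OpenAI-compatible
+      # endpoint on $port.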
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend $backend \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+        "
+
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}
 
 prepare_dataset() {
 
@@ -350,6 +448,9 @@ main() {
 
   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
   upload_to_buildkite
+  # run genai-perf tests
+  # run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+
 }
 
 main "$@"
diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
new file mode 100644
index 0000000000000..7dc66f6796f3d
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -0,0 +1,22 @@
+[
+  {
+    "test_name": "llama8B_tp1_genai_perf",
+    "qps_list": [4,8,16,32],
+    "common_parameters": {
+      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+      "tp": 1,
+      "port": 8000,
+      "reuse_server": false
+    },
+    "vllm_server_parameters": {
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "gpu_memory_utilization": 0.9,
+      "num_scheduler_steps": 10,
+      "max_num_seqs": 512,
+      "dtype": "bfloat16"
+    },
+    "genai_perf_input_parameters": {
+    }
+  }
+]
\ No newline at end of file
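
For reference, with the test case above at qps=4, the loop in run_genai_perf_tests
assembles roughly the following client command (a sketch: this JSON does not set
num_prompts, dataset_name, or dataset_path, so --num-prompts 100 below is a
hypothetical placeholder, not a value from this change):

  genai-perf profile \
    -m meta-llama/Meta-Llama-3-8B-Instruct \
    --service-kind openai \
    --backend vllm \
    --endpoint-type chat \
    --streaming \
    --url localhost:8000 \
    --request-rate 4 \
    --num-prompts 100
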
diff --git a/requirements-test.in b/requirements-test.in
index 57fddb416317e..c33dc4865767e 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -28,4 +28,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.9
+genai_perf==0.0.8
+
 
 numpy < 2.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index c786a1249bddb..4b31ef7ad7c46 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -73,6 +73,8 @@ colorama==0.4.6
     # via
     #   tqdm-multiprocess
 contourpy==1.3.0
     # via matplotlib
+cramjam==2.9.0
+    # via fastparquet
 cupy-cuda12x==13.3.0
     # via ray
 cycler==0.12.1
@@ -107,6 +109,8 @@ email-validator==2.2.0
     # via pydantic
 evaluate==0.4.3
     # via lm-eval
+fastparquet==2024.11.0
+    # via genai-perf
 fastrlock==0.8.2
     # via cupy-cuda12x
 filelock==3.16.1
@@ -128,8 +132,11 @@ fsspec[http]==2024.9.0
     # via
     #   datasets
     #   evaluate
+    #   fastparquet
     #   huggingface-hub
     #   torch
+genai-perf==0.0.8
+    # via -r requirements-test.in
 genson==1.3.0
     # via datamodel-code-generator
 h11==0.14.0
@@ -184,6 +191,8 @@ jsonschema==4.23.0
     #   ray
 jsonschema-specifications==2024.10.1
     # via jsonschema
+kaleido==0.2.1
+    # via genai-perf
 kiwisolver==1.4.7
     # via matplotlib
 lazy-loader==0.4
@@ -198,6 +207,8 @@ lm-eval[api]==0.4.4
     # via -r requirements-test.in
 lxml==5.3.0
     # via sacrebleu
+markdown-it-py==3.0.0
+    # via rich
 markupsafe==3.0.2
     # via jinja2
 matplotlib==3.9.2
@@ -207,6 +218,8 @@ mbstrdecoder==1.1.3
     #   dataproperty
     #   pytablewriter
     #   typepy
+mdurl==0.1.2
+    # via markdown-it-py
 mistral-common[opencv]==1.5.1
     # via
     #   -r requirements-test.in
@@ -247,6 +260,8 @@ numpy==1.26.4
     #   datasets
     #   decord
     #   evaluate
+    #   fastparquet
+    #   genai-perf
     #   librosa
     #   matplotlib
     #   mistral-common
@@ -254,15 +269,18 @@ numpy==1.26.4
     #   numexpr
     #   opencv-python-headless
     #   pandas
+    #   patsy
     #   peft
     #   rouge-score
     #   sacrebleu
     #   scikit-learn
     #   scipy
     #   soxr
+    #   statsmodels
     #   tensorizer
     #   torchvision
     #   transformers
+    #   tritonclient
 nvidia-cublas-cu12==12.4.5.8
     # via
     #   nvidia-cudnn-cu12
@@ -304,30 +322,39 @@ packaging==24.1
     #   datamodel-code-generator
     #   datasets
     #   evaluate
+    #   fastparquet
     #   huggingface-hub
     #   lazy-loader
     #   matplotlib
     #   peft
+    #   plotly
     #   pooch
     #   pytest
     #   pytest-rerunfailures
     #   ray
+    #   statsmodels
     #   transformers
     #   typepy
 pandas==2.2.3
     # via
     #   datasets
     #   evaluate
+    #   fastparquet
+    #   genai-perf
+    #   statsmodels
 pathspec==0.12.1
     # via black
 pathvalidate==3.2.1
     # via pytablewriter
+patsy==1.0.1
+    # via statsmodels
 peft==0.13.2
     # via
     #   -r requirements-test.in
     #   lm-eval
 pillow==10.4.0
     # via
+    #   genai-perf
     #   matplotlib
     #   mistral-common
     #   sentence-transformers
@@ -336,6 +363,8 @@ platformdirs==4.3.6
     # via
     #   black
     #   pooch
+plotly==5.24.1
+    # via genai-perf
 pluggy==1.5.0
     # via pytest
 pooch==1.8.2
@@ -356,7 +385,9 @@ psutil==6.1.0
 py==1.11.0
     # via pytest-forked
 pyarrow==18.0.0
-    # via datasets
+    # via
+    #   datasets
+    #   genai-perf
 pyasn1==0.6.1
     # via rsa
 pybind11==2.13.6
@@ -369,6 +400,8 @@ pydantic[email]==2.9.2
     #   mistral-common
 pydantic-core==2.23.4
     # via pydantic
+pygments==2.18.0
+    # via rich
 pyparsing==3.2.0
     # via matplotlib
 pytablewriter==1.2.0
@@ -377,14 +410,18 @@ pytest==8.3.3
     # via
     #   -r requirements-test.in
     #   buildkite-test-collector
+    #   genai-perf
     #   pytest-asyncio
     #   pytest-forked
+    #   pytest-mock
     #   pytest-rerunfailures
     #   pytest-shard
 pytest-asyncio==0.24.0
     # via -r requirements-test.in
 pytest-forked==1.6.0
     # via -r requirements-test.in
+pytest-mock==3.14.0
+    # via genai-perf
 pytest-rerunfailures==14.0
     # via -r requirements-test.in
 pytest-shard==0.1.2
@@ -395,6 +432,8 @@ python-dateutil==2.9.0.post0
     #   matplotlib
     #   pandas
     #   typepy
+python-rapidjson==1.20
+    # via tritonclient
 pytz==2024.2
     # via
     #   pandas
@@ -405,9 +444,11 @@ pyyaml==6.0.2
     #   awscli
     #   datamodel-code-generator
     #   datasets
+    #   genai-perf
     #   huggingface-hub
     #   peft
     #   ray
+    #   responses
     #   timm
     #   transformers
 ray[adag]==2.40.0
@@ -434,8 +475,13 @@ requests==2.32.3
     #   mistral-common
     #   pooch
     #   ray
+    #   responses
     #   tiktoken
     #   transformers
+responses==0.25.3
+    # via genai-perf
+rich==13.9.4
+    # via genai-perf
 rouge-score==0.1.2
     # via lm-eval
 rpds-py==0.20.1
@@ -466,6 +512,7 @@ scipy==1.13.1
     #   librosa
     #   scikit-learn
     #   sentence-transformers
+    #   statsmodels
 sentence-transformers==3.2.1
     # via -r requirements-test.in
 sentencepiece==0.2.0
@@ -486,6 +533,8 @@ soxr==0.5.0.post1
     # via librosa
 sqlitedict==2.1.0
     # via lm-eval
+statsmodels==0.14.4
+    # via genai-perf
 sympy==1.13.1
     # via torch
 tabledata==1.3.3
@@ -495,7 +544,9 @@ tabulate==0.9.0
 tcolorpy==0.1.6
     # via pytablewriter
 tenacity==9.0.0
-    # via lm-eval
+    # via
+    #   lm-eval
+    #   plotly
 tensorizer==2.9.0
     # via -r requirements-test.in
 threadpoolctl==3.5.0
@@ -536,6 +587,7 @@ tqdm-multiprocess==0.0.11
     # via lm-eval
 transformers==4.47.0
     # via
+    #   genai-perf
     #   lm-eval
     #   peft
     #   sentence-transformers
@@ -544,6 +596,8 @@ transformers-stream-generator==0.0.5
     # via -r requirements-test.in
 triton==3.1.0
     # via torch
+tritonclient==2.41.1
+    # via genai-perf
 typepy[datetime]==1.3.2
     # via
     #   dataproperty
@@ -563,6 +617,7 @@ urllib3==1.26.20
     # via
     #   botocore
     #   requests
+    #   responses
 word2number==1.1
     # via lm-eval
 xxhash==3.5.0