diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
new file mode 100644
index 0000000000000..8350e2705141e
--- /dev/null
+++ b/.buildkite/generate_index.py
@@ -0,0 +1,24 @@
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+        <h1>Links for vLLM</h1/>
+        <a href="../{wheel_html_escaped}">{wheel}</a>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+ print(f"Generated index.html for {args.wheel}")
+ # cloudfront requires escaping the '+' character
+ f.write(
+ template.format(wheel=filename,
+ wheel_html_escaped=filename.replace("+", "%2B")))
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 64ba1b32fb074..708e548727cf5 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -65,9 +65,9 @@ steps:
- VLLM_USAGE_SOURCE
- HF_TOKEN
- - block: "Run H100 Benchmark"
- key: block-h100
- depends_on: ~
+ #- block: "Run H100 Benchmark"
+ #key: block-h100
+ #depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh
index 7345dd4e66b29..3c756659a715a 100644
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@@ -23,6 +23,8 @@ wheel="$new_wheel"
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"
+normal_wheel="$wheel" # Save the original wheel filename
+
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
- mv -- "$wheel" "$new_wheel"
+ # use cp to keep both files in the artifacts directory
+ cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
# Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# generate index for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+ # if $normal_wheel matches cu118, do not upload the index.html
+ echo "Skipping index files for cu118 wheels"
+else
+ # only upload index.html for cu12 wheels (default wheels)
+ aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+ aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+ # if $normal_wheel matches cu118, do not upload the index.html
+ echo "Skipping index files for cu118 wheels"
+else
+ # only upload index.html for cu12 wheels (default wheels)
+ aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index ceef6a5fba456..bb7e4d5b244a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,6 +81,8 @@ instance/
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst
+docs/source/getting_started/examples/*.md
+!**/*.template.md
# PyBuilder
.pybuilder/
diff --git a/Dockerfile b/Dockerfile
index 0944050f7dfca..153bff9cf565f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@
# to run the OpenAI compatible server.
# Please update any changes made here to
-# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/dev/dockerfile/dockerfile.md and
# docs/source/assets/dev/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
@@ -163,7 +163,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
- && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
@@ -240,9 +240,9 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
- pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
- pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index ebe226cf6d148..f163edc27cba8 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0
WORKDIR /workspace
+COPY requirements-build.txt requirements-build.txt
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
- --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
@@ -37,9 +37,9 @@ FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
- --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
- --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY . .
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 2924ea4a49f54..94999630bae12 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -10,7 +10,8 @@ set -ex
kill_gpu_processes() {
# kill all processes on GPU.
- pkill -f pt_main_thread
+ pgrep pt_main_thread | xargs -r kill -9
+ pgrep python3 | xargs -r kill -9
sleep 10
# remove vllm config file
@@ -54,7 +55,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
- --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@@ -64,7 +65,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
- --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+ --model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@@ -87,7 +88,7 @@ benchmark() {
--port 8100 \
--save-result \
--result-dir $results_folder \
- --result-filename disagg_prefill_2xtp4.json \
+ --result-filename disagg_prefill_tp1.json \
--request-rate "inf"
@@ -105,7 +106,7 @@ benchmark() {
--port 8200 \
--save-result \
--result-dir $results_folder \
- --result-filename disagg_prefill_2xtp4.json \
+ --result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
kill_gpu_processes
@@ -118,7 +119,7 @@ main() {
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)
- pip install quart httpx
+ pip install quart httpx datasets
cd "$(dirname "$0")"
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index d8d9e976dce76..eb5d891d0d4a5 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -1,13 +1,12 @@
#!/bin/bash
-# Requirement: 8x H100 GPUs.
+# Requirement: 2x GPUs.
-# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
-# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
-# Resource: 8x H100
+# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
+# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
+# Resource: 2x GPU
# Approaches:
-# 1. Chunked prefill: 1 vllm instance with tp=8
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# Prefilling instance: max_output_token=1
@@ -114,7 +113,6 @@ benchmark() {
--request-rate "$qps"
sleep 2
-
}
@@ -123,8 +121,9 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)
+ (which lsof) || (apt-get -y install lsof)
- pip install quart httpx matplotlib aiohttp
+ pip install quart httpx matplotlib aiohttp datasets
cd "$(dirname "$0")"
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index ca2da4cd66d2d..4859c8ac08bea 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -1,7 +1,7 @@
sphinx==6.2.1
sphinx-book-theme==1.0.1
sphinx-copybutton==0.5.2
-myst-parser==2.0.0
+myst-parser==3.0.1
sphinx-argparse==0.4.0
msgspec
cloudpickle
diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/automatic_prefix_caching/apc.md
new file mode 100644
index 0000000000000..c0c141c5fb7ef
--- /dev/null
+++ b/docs/source/automatic_prefix_caching/apc.md
@@ -0,0 +1,102 @@
+(apc)=
+
+# Introduction
+
+## What is Automatic Prefix Caching
+
+Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
+
+```{note}
+Technical details on how vLLM implements APC are in the next page.
+```
+
+## Enabling APC in vLLM
+
+Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
+
+```python
+import time
+from vllm import LLM, SamplingParams
+
+
+# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
+LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
+|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
+| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
+| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
+| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
+| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
+| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
+| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
+| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
+| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
+| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
+| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
+| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
+| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
+| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
+| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
+| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
+| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
+| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
+| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
+| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
+| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
+| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
+| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
+| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
+| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
+| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
+| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
+| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
+| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
+| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
+| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
+"""
+
+
+def get_generation_time(llm, sampling_params, prompts):
+ # time the generation
+ start_time = time.time()
+ output = llm.generate(prompts, sampling_params=sampling_params)
+ end_time = time.time()
+ # print the output and generation time
+ print(f"Output: {output[0].outputs[0].text}")
+ print(f"Generation time: {end_time - start_time} seconds.")
+
+
+# set enable_prefix_caching=True to enable APC
+llm = LLM(
+ model='lmsys/longchat-13b-16k',
+ enable_prefix_caching=True
+)
+
+sampling_params = SamplingParams(temperature=0, max_tokens=100)
+
+# Querying the age of John Doe
+get_generation_time(
+ llm,
+ sampling_params,
+ LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+)
+
+# Querying the age of Zack Blue
+# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
+get_generation_time(
+ llm,
+ sampling_params,
+ LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
+)
+```
+
+## Example workloads
+
+We describe two example workloads, where APC can provide huge performance benefit:
+
+- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
+- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
+
+## Limits
+
+APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused).
diff --git a/docs/source/automatic_prefix_caching/apc.rst b/docs/source/automatic_prefix_caching/apc.rst
deleted file mode 100644
index 0d70c74689bf9..0000000000000
--- a/docs/source/automatic_prefix_caching/apc.rst
+++ /dev/null
@@ -1,110 +0,0 @@
-.. _apc:
-
-Introduction
-============
-
-What is Automatic Prefix Caching
---------------------------------
-
-Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
-
-
-.. note::
-
- Technical details on how vLLM implements APC are in the next page.
-
-
-
-Enabling APC in vLLM
---------------------
-
-Set ``enable_prefix_caching=True`` in vLLM engine to enable APC. Here is an example:
-
-.. code-block:: python
-
- import time
- from vllm import LLM, SamplingParams
-
-
- # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
- LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
- | ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
- |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
- | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
- | 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
- | 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
- | 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
- | 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
- | 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
- | 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
- | 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
- | 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
- | 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
- | 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
- | 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
- | 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
- | 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
- | 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
- | 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
- | 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
- | 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
- | 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
- | 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
- | 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
- | 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
- | 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
- | 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
- | 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
- | 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
- | 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
- | 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
- | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
- | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
- """
-
-
- def get_generation_time(llm, sampling_params, prompts):
- # time the generation
- start_time = time.time()
- output = llm.generate(prompts, sampling_params=sampling_params)
- end_time = time.time()
- # print the output and generation time
- print(f"Output: {output[0].outputs[0].text}")
- print(f"Generation time: {end_time - start_time} seconds.")
-
-
- # set enable_prefix_caching=True to enable APC
- llm = LLM(
- model='lmsys/longchat-13b-16k',
- enable_prefix_caching=True
- )
-
- sampling_params = SamplingParams(temperature=0, max_tokens=100)
-
- # Querying the age of John Doe
- get_generation_time(
- llm,
- sampling_params,
- LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
- )
-
- # Querying the age of Zack Blue
- # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
- get_generation_time(
- llm,
- sampling_params,
- LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
- )
-
-Example workloads
------------------
-
-We describe two example workloads, where APC can provide huge performance benefit:
-
-- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
-- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
-
-
-Limits
-------
-APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused).
diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md
new file mode 100644
index 0000000000000..43fa9ee616096
--- /dev/null
+++ b/docs/source/community/meetups.md
@@ -0,0 +1,15 @@
+(meetups)=
+
+# vLLM Meetups
+
+We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+
+- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
+- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing)
+- [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing)
+- [The fourth vLLM meetup](https://lu.ma/agivllm), with Cloudflare and BentoML, June 11th 2024. [[Slides]](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing)
+- [The third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/), with Roblox, April 2nd 2024. [[Slides]](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing)
+- [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg)
+- [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing)
+
+We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst
deleted file mode 100644
index c87f01aa263b3..0000000000000
--- a/docs/source/community/meetups.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-.. _meetups:
-
-vLLM Meetups
-============
-
-We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
-
-- `The seventh vLLM meetup `__, with Snowflake, November 14th 2024. `[Slides] `__
-- `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__
-- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__
-- `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__
-- `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__
-- `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__
-- `The first vLLM meetup `__, with a16z, October 5th 2023. `[Slides] `__
-
-We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu `__.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e9d9ac68c9560..1fe0474631140 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -51,7 +51,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = ["**/*.template.rst"]
+exclude_patterns: List[str] = ["**/*.template.md"]
# Exclude the prompt "$" when copying code
copybutton_prompt_text = r"\$ "
@@ -74,6 +74,35 @@
html_static_path = ["_static"]
html_js_files = ["custom.js"]
+myst_url_schemes = {
+ 'http': None,
+ 'https': None,
+ 'mailto': None,
+ 'ftp': None,
+ "gh-issue": {
+ "url":
+ "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}",
+ "title": "Issue #{{path}}",
+ "classes": ["github"],
+ },
+ "gh-pr": {
+ "url":
+ "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}",
+ "title": "Pull Request #{{path}}",
+ "classes": ["github"],
+ },
+ "gh-dir": {
+ "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}",
+ "title": "{{path}}",
+ "classes": ["github"],
+ },
+ "gh-file": {
+ "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}",
+ "title": "{{path}}",
+ "classes": ["github"],
+ },
+}
+
# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
if READTHEDOCS_VERSION_TYPE == "tag":
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
new file mode 100644
index 0000000000000..6535414a7dca4
--- /dev/null
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -0,0 +1,50 @@
+# Dockerfile
+
+We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
+More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
+
+Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
+
+- All build stages
+- The default build target (highlighted in grey)
+- External images (with dashed borders)
+
+The edges of the build graph represent:
+
+- FROM ... dependencies (with a solid line and a full arrow head)
+
+- COPY --from=... dependencies (with a dashed line and an empty arrow head)
+
+- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head)
+
+ > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
+ > :align: center
+ > :alt: query
+ > :width: 100%
+ > ```
+ >
+ > Made using: <https://github.com/patrickhoefler/dockerfilegraph>
+ >
+ > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
+ >
+ > ```bash
+ > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
+ > ```
+ >
+ > or in case you want to run it directly with the docker image:
+ >
+ > ```bash
+ > docker run \
+ > --rm \
+ > --user "$(id -u):$(id -g)" \
+ > --workdir /workspace \
+ > --volume "$(pwd)":/workspace \
+ > ghcr.io/patrickhoefler/dockerfilegraph:alpine \
+ > --output png \
+ > --dpi 200 \
+ > --max-label-length 50 \
+ > --filename Dockerfile \
+ > --legend
+ > ```
+ >
+ > (To run it for a different file, you can pass in a different argument to the flag `--filename`.)
diff --git a/docs/source/contributing/dockerfile/dockerfile.rst b/docs/source/contributing/dockerfile/dockerfile.rst
deleted file mode 100644
index 9c17c27aa61bf..0000000000000
--- a/docs/source/contributing/dockerfile/dockerfile.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-Dockerfile
-====================
-
-See `here `__ for the main Dockerfile to construct
-the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `__.
-
-Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
-
-- All build stages
-- The default build target (highlighted in grey)
-- External images (with dashed borders)
-
-The edges of the build graph represent:
-
-- FROM ... dependencies (with a solid line and a full arrow head)
-- COPY --from=... dependencies (with a dashed line and an empty arrow head)
-- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
-
- .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
- :alt: query
- :width: 100%
- :align: center
-
- Made using: https://github.com/patrickhoefler/dockerfilegraph
-
- Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present):
-
- .. code:: bash
-
- dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
-
- or in case you want to run it directly with the docker image:
-
- .. code:: bash
-
- docker run \
- --rm \
- --user "$(id -u):$(id -g)" \
- --workdir /workspace \
- --volume "$(pwd)":/workspace \
- ghcr.io/patrickhoefler/dockerfilegraph:alpine \
- --output png \
- --dpi 200 \
- --max-label-length 50 \
- --filename Dockerfile \
- --legend
-
- (To run it for a different file, you can pass in a different argument to the flag `--filename`.)
-
-
\ No newline at end of file
diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.md
similarity index 51%
rename from docs/source/contributing/overview.rst
rename to docs/source/contributing/overview.md
index 4cea0afdaea74..9dac41cff0bcb 100644
--- a/docs/source/contributing/overview.rst
+++ b/docs/source/contributing/overview.md
@@ -1,5 +1,4 @@
-Contributing to vLLM
-=====================
+# Contributing to vLLM
Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
@@ -12,132 +11,121 @@ We also believe in the power of community support; thus, answering queries, offe
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
-License
--------
+## License
-See `LICENSE `_.
+See <gh-file:LICENSE>.
-Developing
-----------
+## Developing
-Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details.
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
+Check out the [building from source](#build-from-source) documentation for details.
-Testing
--------
+## Testing
-.. code-block:: bash
+```bash
+pip install -r requirements-dev.txt
- pip install -r requirements-dev.txt
+# linting and formatting
+bash format.sh
+# Static type checking
+mypy
+# Unit tests
+pytest tests/
+```
- # linting and formatting
- bash format.sh
- # Static type checking
- mypy
- # Unit tests
- pytest tests/
+```{note}
+Currently, the repository does not pass the `mypy` tests.
+```
-.. note:: Currently, the repository does not pass the ``mypy`` tests.
+# Contribution Guidelines
-Contribution Guidelines
-=======================
+## Issues
-Issues
-------
+If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
-If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible.
+```{important}
+If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
+```
-.. important::
- If you discover a security vulnerability, please follow the instructions `here `_.
-
-Pull Requests & Code Reviews
-----------------------------
+## Pull Requests & Code Reviews
Thank you for your contribution to vLLM! Before submitting the pull request,
please ensure the PR meets the following criteria. This helps vLLM maintain the
code quality and improve the efficiency of the review process.
-DCO and Signed-off-by
-^^^^^^^^^^^^^^^^^^^^^
+### DCO and Signed-off-by
-When contributing changes to this project, you must agree to the `DCO `_.
-Commits must include a ``Signed-off-by:`` header which certifies agreement with
-the terms of the `DCO `_.
+When contributing changes to this project, you must agree to the <gh-file:DCO>.
+Commits must include a `Signed-off-by:` header which certifies agreement with
+the terms of the DCO.
-Using ``-s`` with ``git commit`` will automatically add this header.
+Using `-s` with `git commit` will automatically add this header.
-PR Title and Classification
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### PR Title and Classification
Only specific types of PRs will be reviewed. The PR title is prefixed
appropriately to indicate the type of change. Please use one of the following:
-- ``[Bugfix]`` for bug fixes.
-- ``[CI/Build]`` for build or continuous integration improvements.
-- ``[Doc]`` for documentation fixes and improvements.
-- ``[Model]`` for adding a new model or improving an existing model. Model name
+- `[Bugfix]` for bug fixes.
+- `[CI/Build]` for build or continuous integration improvements.
+- `[Doc]` for documentation fixes and improvements.
+- `[Model]` for adding a new model or improving an existing model. Model name
should appear in the title.
-- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server,
- ``LLM`` class, etc.)
-- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels.
-- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``,
- ``AsyncLLMEngine``, ``Scheduler``, etc.)
-- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should
- appear in the prefix (e.g., ``[Hardware][AMD]``).
-- ``[Misc]`` for PRs that do not fit the above categories. Please use this
+- `[Frontend]` For changes on the vLLM frontend (e.g., OpenAI API server,
+ `LLM` class, etc.)
+- `[Kernel]` for changes affecting CUDA kernels or other compute kernels.
+- `[Core]` for changes in the core vLLM logic (e.g., `LLMEngine`,
+ `AsyncLLMEngine`, `Scheduler`, etc.)
+- `[Hardware][Vendor]` for hardware-specific changes. Vendor name should
+ appear in the prefix (e.g., `[Hardware][AMD]`).
+- `[Misc]` for PRs that do not fit the above categories. Please use this
sparingly.
-.. note::
- If the PR spans more than one category, please include all relevant prefixes.
+```{note}
+If the PR spans more than one category, please include all relevant prefixes.
+```
-Code Quality
-^^^^^^^^^^^^
+### Code Quality
The PR needs to meet the following code quality standards:
-- We adhere to `Google Python style guide
- `_ and `Google C++ style guide
- `_.
-- Pass all linter checks. Please use `format.sh
- `_ to format your
- code.
+- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
+- Pass all linter checks. Please use <gh-file:format.sh> to format your code.
- The code needs to be well-documented to ensure future contributors can easily
understand the code.
- Include sufficient tests to ensure the project stays correct and robust. This
includes both unit tests and integration tests.
-- Please add documentation to ``docs/source/`` if the PR modifies the
+- Please add documentation to `docs/source/` if the PR modifies the
user-facing behaviors of vLLM. It helps vLLM users understand and utilize the
new features or changes.
-Adding or Changing Kernels
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+### Adding or Changing Kernels
Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
- Make sure custom ops are registered following PyTorch guidelines:
- `Custom C++ and CUDA Operators `_
- and `The Custom Operators Manual `_.
-- Custom operations that return ``Tensors`` require meta-functions.
+ [Custom C++ and CUDA Operators](https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial)
+ and [The Custom Operators Manual](https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU).
+- Custom operations that return `Tensors` require meta-functions.
Meta-functions should be implemented and registered in Python so that dynamic
dims can be handled automatically. See above documents for a description of
meta-functions.
-- Use `torch.library.opcheck() `_
+- Use [torch.library.opcheck()](https://pytorch.org/docs/stable/library.html#torch.library.opcheck)
to test the function registration and meta-function for any registered ops.
- See ``tests/kernels`` for examples.
+ See `tests/kernels` for examples.
- When changing the C++ signature of an existing op, the schema must be updated
to reflect the changes.
- If a new custom type is needed, see the following document:
- `Custom Class Support in PT2 `_.
+ [Custom Class Support in PT2](https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA).
-Notes for Large Changes
-^^^^^^^^^^^^^^^^^^^^^^^
+### Notes for Large Changes
Please keep the changes as concise as possible. For major architectural changes
(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue
(RFC) discussing the technical design and justification. Otherwise, we will tag
-it with ``rfc-required`` and might not go through the PR.
+it with `rfc-required` and might not go through the PR.
-What to Expect for the Reviews
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### What to Expect for the Reviews
The goal of the vLLM team is to be a *transparent reviewing machine*. We would
like to make the review process transparent and efficient and make sure no
@@ -150,15 +138,14 @@ review process:
- After the PR is assigned, the reviewer will provide status updates every 2-3
days. If the PR is not reviewed within 7 days, please feel free to ping the
reviewer or the vLLM team.
-- After the review, the reviewer will put an ``action-required`` label on the PR
+- After the review, the reviewer will put an `action-required` label on the PR
if there are changes required. The contributor should address the comments and
ping the reviewer to re-review the PR.
- Please respond to all comments within a reasonable time frame. If a comment
isn't clear or you disagree with a suggestion, feel free to ask for
clarification or discuss the suggestion.
-Thank You
----------
+## Thank You
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
All of your contributions help make vLLM a great tool and community for everyone!
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
new file mode 100644
index 0000000000000..46210957c19ec
--- /dev/null
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -0,0 +1,41 @@
+# Profiling vLLM
+
+We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`
+
+The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
+
+When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
+
+```{warning}
+Only enable profiling in a development environment.
+```
+
+Traces can be visualized using <https://ui.perfetto.dev/>.
+
+```{tip}
+Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
+```
+
+```{tip}
+To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
+Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
+`export VLLM_RPC_TIMEOUT=1800000`
+```
+
+## Example commands and usage
+
+### Offline Inference
+
+Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example.
+
+### OpenAI Server
+
+```bash
+VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
+```
+
+benchmark_serving.py:
+
+```bash
+python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
+```
diff --git a/docs/source/contributing/profiling/profiling_index.rst b/docs/source/contributing/profiling/profiling_index.rst
deleted file mode 100644
index a422b1fcda521..0000000000000
--- a/docs/source/contributing/profiling/profiling_index.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-==============
-Profiling vLLM
-==============
-
-We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/``
-
-The OpenAI server also needs to be started with the ``VLLM_TORCH_PROFILER_DIR`` environment variable set.
-
-When using ``benchmarks/benchmark_serving.py``, you can enable profiling by passing the ``--profile`` flag.
-
-.. warning::
-
- Only enable profiling in a development environment.
-
-
-Traces can be visualized using https://ui.perfetto.dev/.
-
-.. tip::
-
- Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-
-.. tip::
-
- To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
- Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
- ``export VLLM_RPC_TIMEOUT=1800000``
-
-Example commands and usage:
-===========================
-
-Offline Inference:
-------------------
-
-Refer to `examples/offline_inference_with_profiler.py `_ for an example.
-
-
-OpenAI Server:
---------------
-
-.. code-block:: bash
-
- VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
-
-benchmark_serving.py:
-
-.. code-block:: bash
-
- python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
\ No newline at end of file
diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.md
similarity index 54%
rename from docs/source/design/arch_overview.rst
rename to docs/source/design/arch_overview.md
index bc3f509f0a66e..475a3e5fa9ddc 100644
--- a/docs/source/design/arch_overview.rst
+++ b/docs/source/design/arch_overview.md
@@ -1,25 +1,24 @@
-.. _arch_overview:
+(arch-overview)=
-Architecture Overview
-======================
+# Architecture Overview
This document provides an overview of the vLLM architecture.
-.. contents:: Table of Contents
- :local:
- :depth: 2
+```{contents} Table of Contents
+:depth: 2
+:local: true
+```
-Entrypoints
------------
+## Entrypoints
vLLM provides a number of entrypoints for interacting with the system. The
following diagram shows the relationship between them.
-.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png
- :alt: Entrypoints Diagram
+```{image} /assets/design/arch_overview/entrypoints.excalidraw.png
+:alt: Entrypoints Diagram
+```
-LLM Class
-^^^^^^^^^
+### LLM Class
The LLM class provides the primary Python interface for doing offline inference,
which is interacting with a model without using a separate model inference
@@ -27,75 +26,70 @@ server.
Here is a sample of `LLM` class usage:
-.. code-block:: python
+```python
+from vllm import LLM, SamplingParams
- from vllm import LLM, SamplingParams
+# Define a list of input prompts
+prompts = [
+ "Hello, my name is",
+ "The capital of France is",
+ "The largest ocean is",
+]
- # Define a list of input prompts
- prompts = [
- "Hello, my name is",
- "The capital of France is",
- "The largest ocean is",
- ]
+# Define sampling parameters
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
- # Define sampling parameters
- sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+# Initialize the LLM engine with the OPT-125M model
+llm = LLM(model="facebook/opt-125m")
- # Initialize the LLM engine with the OPT-125M model
- llm = LLM(model="facebook/opt-125m")
+# Generate outputs for the input prompts
+outputs = llm.generate(prompts, sampling_params)
- # Generate outputs for the input prompts
- outputs = llm.generate(prompts, sampling_params)
+# Print the generated outputs
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
- # Print the generated outputs
- for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs[0].text
- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-More API details can be found in the :doc:`Offline Inference
+More API details can be found in the {doc}`Offline Inference
</dev/offline_inference/offline_index>` section of the API docs.
-The code for the `LLM` class can be found in `vllm/entrypoints/llm.py
-`_.
+The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
-OpenAI-compatible API server
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### OpenAI-compatible API server
The second primary interface to vLLM is via its OpenAI-compatible API server.
This server can be started using the `vllm serve` command.
-.. code-block:: bash
-
- vllm serve
+```bash
+vllm serve
+```
-The code for the `vllm` CLI can be found in `vllm/scripts.py
-`_.
+The code for the `vllm` CLI can be found in <gh-file:vllm/scripts.py>.
Sometimes you may see the API server entrypoint used directly instead of via the
`vllm` CLI command. For example:
-.. code-block:: bash
-
- python -m vllm.entrypoints.openai.api_server --model
+```bash
+python -m vllm.entrypoints.openai.api_server --model
+```
-That code can be found in `vllm/entrypoints/openai/api_server.py
-`_.
+That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
-More details on the API server can be found in the :doc:`OpenAI Compatible
+More details on the API server can be found in the {doc}`OpenAI Compatible
Server </serving/openai_compatible_server>` document.
-LLM Engine
-----------
+## LLM Engine
The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
the vLLM system, handling model inference and asynchronous request processing.
-.. image:: /assets/design/arch_overview/llm_engine.excalidraw.png
- :alt: LLMEngine Diagram
+```{image} /assets/design/arch_overview/llm_engine.excalidraw.png
+:alt: LLMEngine Diagram
+```
-LLMEngine
-^^^^^^^^^
+### LLMEngine
The `LLMEngine` class is the core component of the vLLM engine. It is
responsible for receiving requests from clients and generating outputs from the
@@ -105,21 +99,15 @@ processing.
- **Input Processing**: Handles tokenization of input text using the specified
tokenizer.
-
- **Scheduling**: Chooses which requests are processed in each step.
-
- **Model Execution**: Manages the execution of the language model, including
distributed execution across multiple GPUs.
-
- **Output Processing**: Processes the outputs generated by the model, decoding the
token IDs from a language model into human-readable text.
-The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_.
-
-.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py
+The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
-AsyncLLMEngine
-^^^^^^^^^^^^^^
+### AsyncLLMEngine
The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class.
It uses `asyncio` to create a background loop that continuously processes
@@ -127,55 +115,46 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
can handle multiple concurrent requests and stream outputs to clients.
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
-API server that serves as a simpler example in
-`vllm/entrypoints/api_server.py`_.
-
-.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py
+API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
-The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_.
+The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
-.. _vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py
-
-Worker
-------
+## Worker
A worker is a process that runs the model inference. vLLM follows the common
practice of using one process to control one accelerator device, such as GPUs.
For example, if we use tensor parallelism of size 2 and pipeline parallelism of
size 2, we will have 4 workers in total. Workers are identified by their
-``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while
-``local_rank`` is mainly used for assigning the accelerator device and accessing
+`rank` and `local_rank`. `rank` is used for global orchestration, while
+`local_rank` is mainly used for assigning the accelerator device and accessing
local resources such as the file system and shared memory.
-Model Runner
-------------
+## Model Runner
Every worker has one model runner object, responsible for loading and running
the model. Much of the model execution logic resides here, such as preparing
input tensors and capturing cudagraphs.
-Model
------
+## Model
Every model runner object has one model object, which is the actual
-``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various
+`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various
configurations affect the class we ultimately get.
-Class Hierarchy
----------------
+## Class Hierarchy
The following figure shows the class hierarchy of vLLM:
- .. figure:: /assets/design/hierarchy.png
- :alt: query
- :width: 100%
- :align: center
+> ```{figure} /assets/design/hierarchy.png
+> :align: center
+> :alt: query
+> :width: 100%
+> ```
There are several important design choices behind this class hierarchy:
-1. **Extensibility**: All classes in the hierarchy accept a configuration object
-containing all the necessary information. The `VllmConfig
-`__
+1\. **Extensibility**: All classes in the hierarchy accept a configuration object
+containing all the necessary information. The [VllmConfig](https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036)
class is the main configuration object that is passed around. The class
hierarchy is quite deep, and every class needs to read the configuration it is
interested in. By encapsulating all configurations in one object, we can easily
@@ -188,7 +167,7 @@ the `VllmConfig` class, and the model runner can access it directly. We don't
need to change the constructor of the engine, worker, or model class to pass the
new configuration option.
-2. **Uniformity**: The model runner needs a unified interface to create and
+2\. **Uniformity**: The model runner needs a unified interface to create and
initialize the model. vLLM supports more than 50 types of popular open-source
models. Each model has its own initialization logic. If the constructor
signature varies with models, the model runner does not know how to call the
@@ -200,46 +179,46 @@ of a vision model and a language model. By making the constructor uniform, we
can easily create a vision model and a language model and compose them into a
vision-language model.
-.. note::
-
- To support this change, all vLLM models' signatures have been updated to:
-
- .. code-block:: python
-
- def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-
- To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
-
- .. code-block:: python
-
- class MyOldModel(nn.Module):
- def __init__(
- self,
- config,
- cache_config: Optional[CacheConfig] = None,
- quant_config: Optional[QuantizationConfig] = None,
- lora_config: Optional[LoRAConfig] = None,
- prefix: str = "",
- ) -> None:
- ...
-
- from vllm.config import VllmConfig
- class MyNewModel(MyOldModel):
- def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
- config = vllm_config.model_config.hf_config
- cache_config = vllm_config.cache_config
- quant_config = vllm_config.quant_config
- lora_config = vllm_config.lora_config
- super().__init__(config, cache_config, quant_config, lora_config, prefix)
-
- if __version__ >= "0.6.4":
- MyModel = MyNewModel
- else:
- MyModel = MyOldModel
-
- This way, the model can work with both old and new versions of vLLM.
-
-3. **Sharding and Quantization at Initialization**: Certain features require
+````{note}
+To support this change, all vLLM models' signatures have been updated to:
+
+```python
+def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+```
+
+To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
+
+```python
+class MyOldModel(nn.Module):
+ def __init__(
+ self,
+ config,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None,
+ lora_config: Optional[LoRAConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ ...
+
+from vllm.config import VllmConfig
+class MyNewModel(MyOldModel):
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ config = vllm_config.model_config.hf_config
+ cache_config = vllm_config.cache_config
+ quant_config = vllm_config.quant_config
+ lora_config = vllm_config.lora_config
+ super().__init__(config, cache_config, quant_config, lora_config, prefix)
+
+if __version__ >= "0.6.4":
+ MyModel = MyNewModel
+else:
+ MyModel = MyOldModel
+```
+
+This way, the model can work with both old and new versions of vLLM.
+````
+
+3\. **Sharding and Quantization at Initialization**: Certain features require
changing the model weights. For example, tensor parallelism needs to shard the
model weights, and quantization needs to quantize the model weights. There are
two possible ways to implement this feature. One way is to change the model
@@ -252,23 +231,23 @@ initialized, we need to load the full 810GB weights to every GPU and then shard
the weights, leading to a huge memory overhead. Instead, if we shard the weights
during the model initialization, every layer will only create a shard of the
weights it needs, leading to a much smaller memory overhead. The same idea
-applies to quantization. Note that we also add an additional argument ``prefix``
+applies to quantization. Note that we also add an additional argument `prefix`
to the model's constructor so that the model can initialize itself differently
based on the prefix. This is useful for non-uniform quantization, where
-different parts of the model are quantized differently. The ``prefix`` is
-usually an empty string for the top-level model and a string like ``"vision"``
-or ``"language"`` for the sub-models. In general, it matches the name of the
+different parts of the model are quantized differently. The `prefix` is
+usually an empty string for the top-level model and a string like `"vision"`
+or `"language"` for the sub-models. In general, it matches the name of the
module's state dict in the checkpoint file.
One disadvantage of this design is that it is hard to write unit tests for
individual components in vLLM because every component needs to be initialized by
a complete config object. We solve this problem by providing a default
initialization function that creates a default config object with all fields set
-to ``None``. If the component we want to test only cares about a few fields in
+to `None`. If the component we want to test only cares about a few fields in
the config object, we can create a default config object and set the fields we
care about. This way, we can test the component in isolation. Note that many
tests in vLLM are end-to-end tests that test the whole system, so this is not a
big problem.
-In summary, the complete config object ``VllmConfig`` can be treated as an
+In summary, the complete config object `VllmConfig` can be treated as an
engine-level global state that is shared among all vLLM classes.
diff --git a/docs/source/design/huggingface_integration.md b/docs/source/design/huggingface_integration.md
new file mode 100644
index 0000000000000..99b4cb56424c6
--- /dev/null
+++ b/docs/source/design/huggingface_integration.md
@@ -0,0 +1,36 @@
+(huggingface-integration)=
+
+# Integration with HuggingFace
+
+This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
+
+Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qwen2-7B`.
+
+1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process:
+
+ - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
+ - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works.
+ - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
+
+2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation.
+
+3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that:
+
+ - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
+ - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
+
+4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation.
+
+5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs.
+
+Beyond that, there are two more things vLLM depends on HuggingFace for.
+
+1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).
+
+2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
+
+ - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385).
+
+This completes the integration between vLLM and HuggingFace.
+
+In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst
deleted file mode 100644
index e6c1cea6001ea..0000000000000
--- a/docs/source/design/huggingface_integration.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-.. _huggingface_integration:
-
-Integration with HuggingFace
-===================================
-
-This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``.
-
-Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``.
-
-1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process:
-
- - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
-
- - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works.
-
- - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file.
-
-2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation.
-
-3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that:
-
- - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example.
-
- - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled.
-
-4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation.
-
-5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs.
-
-Beyond that, there are two more things vLLM depends on HuggingFace for.
-
-1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__.
-
-2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights.
-
- - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that:
-
-This completes the integration between vLLM and HuggingFace.
-
-In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md
new file mode 100644
index 0000000000000..bb16920e3d0c0
--- /dev/null
+++ b/docs/source/design/input_processing/input_processing_pipeline.md
@@ -0,0 +1,19 @@
+(input-processing-pipeline)=
+
+# Input Processing Pipeline
+
+1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`).
+
+2. Tokenize the data if necessary.
+
+3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `.
+
+ - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
+
+4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`.
+
+5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`.
+
+6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `.
+
+ - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model.
diff --git a/docs/source/design/input_processing/input_processing_pipeline.rst b/docs/source/design/input_processing/input_processing_pipeline.rst
deleted file mode 100644
index 48abec8f75286..0000000000000
--- a/docs/source/design/input_processing/input_processing_pipeline.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. _input_processing_pipeline:
-
-Input Processing Pipeline
-=========================
-
-1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
-
-2. Tokenize the data if necessary.
-
-3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `.
-
- - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
-
-4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
-
-5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
-
-6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `.
-
- - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model.
diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md
new file mode 100644
index 0000000000000..cb415366e5a66
--- /dev/null
+++ b/docs/source/design/input_processing/model_inputs_index.md
@@ -0,0 +1,43 @@
+(input-processing)=
+
+# Input Processing
+
+```{eval-rst}
+.. currentmodule:: vllm.inputs
+```
+
+Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via
+{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+
+Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input
+data in addition to input prompt, but it can be extended to text-only language models when needed.
+
+## Guides
+
+```{toctree}
+:maxdepth: 1
+
+input_processing_pipeline
+```
+
+## Module Contents
+
+### LLM Engine Inputs
+
+```{eval-rst}
+.. autoclass:: vllm.inputs.DecoderOnlyInputs
+ :members:
+ :show-inheritance:
+```
+
+### Registry
+
+```{eval-rst}
+.. autodata:: vllm.inputs.INPUT_REGISTRY
+```
+
+```{eval-rst}
+.. automodule:: vllm.inputs.registry
+ :members:
+ :show-inheritance:
+```
diff --git a/docs/source/design/input_processing/model_inputs_index.rst b/docs/source/design/input_processing/model_inputs_index.rst
deleted file mode 100644
index f0ec1fea15ddb..0000000000000
--- a/docs/source/design/input_processing/model_inputs_index.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-.. _input_processing:
-
-Input Processing
-================
-
-.. currentmodule:: vllm.inputs
-
-Each model can override parts of vLLM's :ref:`input processing pipeline ` via
-:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-
-Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input
-data in addition to input prompt, but it can be extended to text-only language models when needed.
-
-Guides
-++++++
-
-.. toctree::
- :maxdepth: 1
-
- input_processing_pipeline
-
-Module Contents
-+++++++++++++++
-
-LLM Engine Inputs
------------------
-
-.. autoclass:: vllm.inputs.DecoderOnlyInputs
- :members:
- :show-inheritance:
-
-Registry
---------
-
-.. autodata:: vllm.inputs.INPUT_REGISTRY
-
-.. automodule:: vllm.inputs.registry
- :members:
- :show-inheritance:
diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md
new file mode 100644
index 0000000000000..c21985b36eb3a
--- /dev/null
+++ b/docs/source/design/kernel/paged_attention.md
@@ -0,0 +1,527 @@
+# vLLM Paged Attention
+
+- Currently, vLLM utilizes its own implementation of a multi-head query
+ attention kernel (`csrc/attention/attention_kernels.cu`).
+ This kernel is designed to be compatible with
+ vLLM's paged KV caches, where the key and value cache are stored in
+ separate blocks (note that this block concept differs from the GPU
+  thread block. So later in this document, I will refer to the vLLM paged
+  attention block as "block", while referring to the GPU thread block as
+  "thread block").
+- To achieve high performance, this kernel relies on a specially
+ designed memory layout and access method, specifically when threads
+ read data from global memory to shared memory. The purpose of this
+ document is to provide a high-level explanation of the kernel
+ implementation step by step, aiding those who wish to learn about the
+ vLLM multi-head query attention kernel. After going through this
+  document, users will likely have a better understanding and find it
+  easier to follow the actual implementation.
+- Please note that this document may not cover all details, such as how
+ to calculate the correct index for the corresponding data or the dot
+ multiplication implementation. However, after reading this document
+ and becoming familiar with the high-level logic flow, it should be
+ easier for you to read the actual code and understand the details.
+
+## Inputs
+
+- The kernel function takes a list of arguments for the current thread
+ to perform its assigned work. The three most important arguments are
+ the input pointers `q`, `k_cache`, and `v_cache`, which point
+ to query, key, and value data on global memory that need to be read
+ and processed. The output pointer `out` points to global memory
+ where the result should be written. These four pointers actually
+ refer to multi-dimensional arrays, but each thread only accesses the
+ portion of data assigned to it. I have omitted all other runtime
+ parameters here for simplicity.
+
+ ```cpp
+ template<
+ typename scalar_t,
+ int HEAD_SIZE,
+ int BLOCK_SIZE,
+ int NUM_THREADS,
+ int PARTITION_SIZE = 0>
+ __device__ void paged_attention_kernel(
+ ... // Other side args.
+ const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
+ ... // Other side args.
+ )
+ ```
+
+- There are also a list of template arguments above the function
+ signature that are determined during compilation time. `scalar_t`
+ represents the data type of the query, key, and value data elements,
+ such as FP16. `HEAD_SIZE` indicates the number of elements in each
+ head. `BLOCK_SIZE` refers to the number of tokens in each block.
+ `NUM_THREADS` denotes the number of threads in each thread block.
+ `PARTITION_SIZE` represents the number of tensor parallel GPUs (For
+ simplicity, we assume this is 0 and tensor parallel is disabled).
+
+- With these arguments, we need to perform a sequence of preparations.
+ This includes calculating the current head index, block index, and
+ other necessary variables. However, for now, we can ignore these
+ preparations and proceed directly to the actual calculations. It will
+ be easier to understand them once we grasp the entire flow.
+
+## Concepts
+
+- Just before we dive into the calculation flow, I want to describe a
+ few concepts that are needed for later sections. However, you may
+ skip this section and return later if you encounter any confusing
+ terminologies.
+- **Sequence**: A sequence represents a client request. For example,
+ the data pointed to by `q` has a shape of
+  `[num_seqs, num_heads, head_size]`. This means there are a total of
+  `num_seqs` query sequences pointed to by `q`. Since this
+ kernel is a single query attention kernel, each sequence only has one
+ query token. Hence, the `num_seqs` equals the total number of tokens
+ that are processed in the batch.
+- **Context**: The context consists of the generated tokens from the
+ sequence. For instance, `["What", "is", "your"]` are the context
+ tokens, and the input query token is `"name"`. The model might
+ generate the token `"?"`.
+- **Vec**: The vec is a list of elements that are fetched and
+ calculated together. For query and key data, the vec size
+ (`VEC_SIZE`) is determined so that each thread group can fetch and
+ calculate 16 bytes of data at a time. For value data, the vec size
+ (`V_VEC_SIZE`) is determined so that each thread can fetch and
+ calculate 16 bytes of data at a time. For example, if the
+ `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the
+ `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8.
+- **Thread group**: The thread group is a small group of
+ threads(`THREAD_GROUP_SIZE`) that fetches and calculates one
+ query token and one key token at a time. Each thread handles only a
+ portion of the token data. The total number of elements processed by
+ one thread group is referred as `x`. For example, if the thread
+ group contains 2 threads and the head size is 8, then thread 0
+ handles the query and key elements at index 0, 2, 4, 6, while thread
+ 1 handles the elements at index 1, 3, 5, 7.
+- **Block**: The key and value cache data in vLLM are split into
+ blocks. Each block stores data for a fixed number(`BLOCK_SIZE`)
+ of tokens at one head. Each block may contain only a portion of the
+ whole context tokens. For example, if the block size is 16 and the
+ head size is 128, then for one head, one block can store 16 * 128 =
+ 2048 elements.
+- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that
+ execute simultaneously on a stream multiprocessor (SM). In this
+ kernel, each warp processes the calculation between one query token
+ and key tokens of one entire block at a time (it may process multiple
+ blocks in multiple iterations). For example, if there are 4 warps and
+ 6 blocks for one context, the assignment would be like warp 0 handles
+ the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
+ handles the 2nd block and warp 3 handles the 3rd block.
+- **Thread block**: A thread block is a group of
+ threads(`NUM_THREADS`) that can access the same shared memory.
+ Each thread block contains multiple warps(`NUM_WARPS`), and in
+ this kernel, each thread block processes the calculation between one
+ query token and key tokens of a whole context.
+- **Grid**: A grid is a collection of thread blocks and defines the
+ shape of the collection. In this kernel, the shape is
+ `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread
+ block only handles the calculation for one head, one sequence, and
+ one partition.
+
+## Query
+
+- This section will introduce how query data is stored in memory and
+ fetched by each thread. As mentioned above, each thread group fetches
+ one query token data, while each thread itself only handles a part of
+ one query token data. Within each warp, every thread group will fetch
+ the same query token data, but will multiply it with different key
+ token data.
+
+ ```cpp
+ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
+ ```
+
+ ```{figure} ../../assets/kernel/query.png
+ :align: center
+ :alt: query
+ :width: 70%
+
+ Query data of one token at one head
+ ```
+
+- Each thread defines its own `q_ptr` which points to the assigned
+ query token data on global memory. For example, if `VEC_SIZE` is 4
+ and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
+ total of 128 elements divided into 128 / 4 = 32 vecs.
+
+ ```{figure} ../../assets/kernel/q_vecs.png
+ :align: center
+ :alt: q_vecs
+ :width: 70%
+
+ `q_vecs` for one thread group
+ ```
+
+ ```cpp
+ __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
+ ```
+
+- Next, we need to read the global memory data pointed to by `q_ptr`
+ into shared memory as `q_vecs`. It is important to note that each
+ vecs is assigned to a different row. For example, if the
+ `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs,
+ while thread 1 handles the 1st row vecs. By reading the query data in
+ this way, neighboring threads like thread 0 and thread 1 can read
+ neighbor memory, achieving the memory coalescing to improve
+ performance.
+
+## Key
+
+- Similar to the "Query" section, this section introduces memory layout
+  and assignment for keys. While each thread group only handles one
+  query token per kernel run, it may handle multiple key tokens across
+ multiple iterations. Meanwhile, each warp will process multiple blocks
+ of key tokens in multiple iterations, ensuring that all context
+ tokens are processed by the entire thread group after the kernel run.
+ In this context, "handle" refers to performing the dot multiplication
+ between query data and key data.
+
+ ```cpp
+ const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+ + kv_head_idx * kv_head_stride
+ + physical_block_offset * x;
+ ```
+
+- Unlike `q_ptr`, `k_ptr` in each thread will point to a different
+  key token at different iterations. As shown above, `k_ptr`
+  points to key token data based on `k_cache` at the assigned block,
+  assigned head and assigned token.
+
+ ```{figure} ../../assets/kernel/key.png
+ :align: center
+ :alt: key
+ :width: 70%
+
+ Key data of all context tokens at one head
+ ```
+
+- The diagram above illustrates the memory layout for key data. It
+ assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
+ 8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
+ rectangle represents all the elements for one key token at one head,
+ which will be processed by one thread group. The left half shows the
+ total 16 blocks of key token data for warp 0, while the right half
+ represents the remaining key token data for other warps or
+ iterations. Inside each rectangle, there are a total 32 vecs (128
+ elements for one token) that will be processed by 2 threads (one
+ thread group) separately.
+
+ ```{figure} ../../assets/kernel/k_vecs.png
+ :align: center
+ :alt: k_vecs
+ :width: 70%
+
+ `k_vecs` for one thread
+ ```
+
+ ```cpp
+ K_vec k_vecs[NUM_VECS_PER_THREAD]
+ ```
+
+- Next, we need to read the key token data from `k_ptr` and store
+ them on register memory as `k_vecs`. We use register memory for
+ `k_vecs` because it will only be accessed by one thread once,
+ whereas `q_vecs` will be accessed by multiple threads multiple
+ times. Each `k_vecs` will contain multiple vectors for later
+ calculation. Each vec will be set at each inner iteration. The
+ assignment of vecs allows neighboring threads in a warp to read
+ neighboring memory together, which again promotes the memory
+ coalescing. For instance, thread 0 will read vec 0, while thread 1
+ will read vec 1. In the next inner loop, thread 0 will read vec 2,
+ while thread 1 will read vec 3, and so on.
+
+- You may still be a little confused about the overall flow. Don't
+ worry, please keep reading the next "QK" section. It will illustrate
+ the query and key calculation flow in a clearer and higher-level
+ manner.
+
+## QK
+
+- As shown in the pseudo code below, before the entire for loop block, we
+ fetch the query data for one token and store it in `q_vecs`. Then,
+ in the outer for loop, we iterate through different `k_ptrs` that
+ point to different tokens and prepare the `k_vecs` in the inner for
+ loop. Finally, we perform the dot multiplication between the
+ `q_vecs` and each `k_vecs`.
+
+ ```cpp
+ q_vecs = ...
+ for ... {
+ k_ptr = ...
+ for ... {
+ k_vecs[i] = ...
+ }
+ ...
+ float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
+ }
+ ```
+
+- As mentioned before, for each thread, it only fetches part of the
+  query and key token data at a time. However, there will be a cross
+  thread group reduction happening in `Qk_dot<>::dot`. So `qk`
+ returned here is not just between part of the query and key token dot
+ multiplication, but actually a full result between entire query and
+ key token data.
+
+- For example, if the value of `HEAD_SIZE` is 128 and
+ `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain
+ total 64 elements. However, the returned `qk` is actually the
+ result of dot multiplication between 128 query elements and 128 key
+ elements. If you want to learn more about the details of the dot
+ multiplication and reduction, you may refer to the implementation of
+ `Qk_dot<>::dot`. However, for the sake of simplicity, I will not
+ cover it in this document.
+
+## Softmax
+
+- Next, we need to calculate the normalized softmax for all `qk`s,
+ as shown above, where each $x$ represents a `qk`. To do this,
+ we must obtain the reduced value of `qk_max`($m(x)$) and
+ the `exp_sum`($\ell(x)$) of all `qk`s. The reduction
+ should be performed across the entire thread block, encompassing
+ results between the query token and all context key tokens.
+
+ ```{math}
+ :nowrap: true
+
+ \begin{gather*}
+ m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
+ \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
+ \end{gather*}
+ ```
+
+### `qk_max` and `logits`
+
+- Just right after we get the `qk` result, we can set the temporary
+ `logits` result with `qk` (In the end, the `logits` should
+ store the normalized softmax result). Also we can compare and collect
+ the `qk_max` for all `qk`s that are calculated by current
+ thread group.
+
+ ```cpp
+ if (thread_group_offset == 0) {
+ const bool mask = token_idx >= context_len;
+ logits[token_idx - start_token_idx] = mask ? 0.f : qk;
+ qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+ }
+ ```
+
+- Please note that the `logits` here is on shared memory, so each
+  thread group will set the fields for its own assigned context tokens.
+  Overall, the size of `logits` should be the number of context tokens.
+
+ ```cpp
+ for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+ qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+ }
+
+ if (lane == 0) {
+ red_smem[warp_idx] = qk_max;
+ }
+ ```
+
+- Then we need to get the reduced `qk_max` across each warp. The main
+  idea is to make the threads in a warp communicate with each other and
+  get the final max `qk`.
+
+ ```cpp
+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+ qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+ }
+ qk_max = VLLM_SHFL_SYNC(qk_max, 0);
+ ```
+
+- Finally, we can get the reduced `qk_max` from the whole thread block
+  by comparing the `qk_max` from all warps in this thread block. Then
+  we need to broadcast the final result to each thread.
+
+### `exp_sum`
+
+- Similar to `qk_max`, we need to get the reduced sum value from the
+ entire thread block too.
+
+ ```cpp
+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+ float val = __expf(logits[i] - qk_max);
+ logits[i] = val;
+ exp_sum += val;
+ }
+ ...
+ exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum);
+ ```
+
+- Firstly, sum all exp values from each thread group, and meanwhile,
+ convert each entry of `logits` from `qk` to `exp(qk - qk_max)`.
+ Please note, the `qk_max` here is already the max `qk` across the
+ whole thread block. And then we can do reduction for `exp_sum`
+ across whole thread block just like the `qk_max`.
+
+ ```cpp
+ const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+ logits[i] *= inv_sum;
+ }
+ ```
+
+- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain
+ the final normalized softmax result as `logits`. This `logits`
+ variable will be used for dot multiplication with the value data in
+ later steps. Now, it should store the normalized softmax result of
+ `qk` for all assigned context tokens.
+
+## Value
+
+```{figure} ../../assets/kernel/value.png
+:align: center
+:alt: value
+:width: 70%
+
+Value data of all context tokens at one head
+```
+
+```{figure} ../../assets/kernel/logits_vec.png
+:align: center
+:alt: logits_vec
+:width: 50%
+
+`logits_vec` for one thread
+```
+
+```{figure} ../../assets/kernel/v_vec.png
+:align: center
+:alt: v_vec
+:width: 70%
+
+List of `v_vec` for one thread
+```
+
+- Now we need to retrieve the value data and perform dot multiplication
+ with `logits`. Unlike query and key, there is no thread group
+  concept for value data. As shown in the diagram, unlike the key token
+  memory layout, elements from the same column correspond to the same
+ value token. For one block of value data, there are `HEAD_SIZE` of
+ rows and `BLOCK_SIZE` of columns that are split into multiple
+ `v_vecs`.
+
+- Each thread always fetches `V_VEC_SIZE` elements from the same
+ `V_VEC_SIZE` of tokens at a time. As a result, a single thread
+ retrieves multiple `v_vec`s from different rows and the same
+ columns through multiple inner iterations. For each `v_vec`, it
+ needs to be dot multiplied with the corresponding `logits_vec`,
+  which is also `V_VEC_SIZE` elements from `logits`. Overall, with
+  multiple inner iterations, each warp will process one block of value
+  tokens. And with multiple outer iterations, the whole context value
+  tokens are processed.
+
+ ```cpp
+ float accs[NUM_ROWS_PER_THREAD];
+ for ... { // Iteration over different blocks.
+ logits_vec = ...
+ for ... { // Iteration over different rows.
+ v_vec = ...
+ ...
+ accs[i] += dot(logits_vec, v_vec);
+ }
+ }
+ ```
+
+- As shown in the above pseudo code, in the outer loop, similar to
+ `k_ptr`, `logits_vec` iterates over different blocks and reads
+ `V_VEC_SIZE` elements from `logits`. In the inner loop, each
+ thread reads `V_VEC_SIZE` elements from the same tokens as a
+ `v_vec` and performs dot multiplication. It is important to note
+ that in each inner iteration, the thread fetches different head
+ position elements for the same tokens. The dot result is then
+ accumulated in `accs`. Therefore, each entry of `accs` is mapped
+ to a head position assigned to the current thread.
+
+- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
+ thread fetches 8 value elements for 8 tokens at a time. Each element
+ is from different tokens at the same head position. If `HEAD_SIZE`
+ is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to
+ fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are
+ a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle
+ a whole block of value tokens. And each `accs` in each thread
+ contains 8 elements that accumulated at 8 different head positions.
+  For thread 0, the `accs` variable will have 8 elements, which
+  are the 0th, 32nd, …, 224th elements of a value head that are accumulated
+ from all assigned 8 tokens.
+
+## LV
+
+- Now, we need to perform reduction for `accs` within each warp. This
+ process allows each thread to accumulate the `accs` for the
+ assigned head positions of all tokens in one block.
+
+ ```cpp
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+ float acc = accs[i];
+ for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+ acc += VLLM_SHFL_XOR_SYNC(acc, mask);
+ }
+ accs[i] = acc;
+ }
+ ```
+
+- Next, we perform reduction for `accs` across all warps, allowing
+ each thread to have the accumulation of `accs` for the assigned
+ head positions of all context tokens. Please note that each `accs`
+ in every thread only stores the accumulation for a portion of
+ elements of the entire head for all context tokens. However, overall,
+ all results for output have been calculated but are just stored in
+ different thread register memory.
+
+ ```cpp
+ float* out_smem = reinterpret_cast(shared_mem);
+ for (int i = NUM_WARPS; i > 1; i /= 2) {
+ // Upper warps write to shared memory.
+ ...
+ float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+ ...
+ dst[row_idx] = accs[i];
+ }
+
+ // Lower warps update the output.
+ const float* src = &out_smem[warp_idx * HEAD_SIZE];
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+ ...
+ accs[i] += src[row_idx];
+ }
+
+ // Write out the accs.
+ }
+ ```
+
+## Output
+
+- Now we can write all of the calculated results from local register
+  memory to the final output global memory.
+
+ ```cpp
+ scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+ + head_idx * max_num_partitions * HEAD_SIZE
+ + partition_idx * HEAD_SIZE;
+ ```
+
+- First, we need to define the `out_ptr` variable, which points to
+ the start address of the assigned sequence and assigned head.
+
+ ```cpp
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+ from_float(*(out_ptr + row_idx), accs[i]);
+ }
+ }
+ ```
+
+- Finally, we need to iterate over different assigned head positions
+ and write out the corresponding accumulated result based on the
+ `out_ptr`.
diff --git a/docs/source/design/kernel/paged_attention.rst b/docs/source/design/kernel/paged_attention.rst
deleted file mode 100644
index ba4f7a2718158..0000000000000
--- a/docs/source/design/kernel/paged_attention.rst
+++ /dev/null
@@ -1,525 +0,0 @@
-vLLM Paged Attention
-====================
-
-- Currently, vLLM utilizes its own implementation of a multi-head query
- attention kernel (``csrc/attention/attention_kernels.cu``).
- This kernel is designed to be compatible with
- vLLM's paged KV caches, where the key and value cache are stored in
- separate blocks (note that this block concept differs from the GPU
- thread block. So in a later document, I will refer to vLLM paged
- attention block as "block", while refer to GPU thread block as
- "thread block").
-- To achieve high performance, this kernel relies on a specially
- designed memory layout and access method, specifically when threads
- read data from global memory to shared memory. The purpose of this
- document is to provide a high-level explanation of the kernel
- implementation step by step, aiding those who wish to learn about the
- vLLM multi-head query attention kernel. After going through this
- document, users will likely have a better understanding and feel easier
- to follow the actual implementation.
-- Please note that this document may not cover all details, such as how
- to calculate the correct index for the corresponding data or the dot
- multiplication implementation. However, after reading this document
- and becoming familiar with the high-level logic flow, it should be
- easier for you to read the actual code and understand the details.
-
-Inputs
-------
-
-- The kernel function takes a list of arguments for the current thread
- to perform its assigned work. The three most important arguments are
- the input pointers ``q``, ``k_cache``, and ``v_cache``, which point
- to query, key, and value data on global memory that need to be read
- and processed. The output pointer ``out`` points to global memory
- where the result should be written. These four pointers actually
- refer to multi-dimensional arrays, but each thread only accesses the
- portion of data assigned to it. I have omitted all other runtime
- parameters here for simplicity.
-
- .. code:: cpp
-
- template<
- typename scalar_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- int PARTITION_SIZE = 0>
- __device__ void paged_attention_kernel(
- ... // Other side args.
- const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- ... // Other side args.
- )
-
-- There are also a list of template arguments above the function
- signature that are determined during compilation time. ``scalar_t``
- represents the data type of the query, key, and value data elements,
- such as FP16. ``HEAD_SIZE`` indicates the number of elements in each
- head. ``BLOCK_SIZE`` refers to the number of tokens in each block.
- ``NUM_THREADS`` denotes the number of threads in each thread block.
- ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (For
- simplicity, we assume this is 0 and tensor parallel is disabled).
-- With these arguments, we need to perform a sequence of preparations.
- This includes calculating the current head index, block index, and
- other necessary variables. However, for now, we can ignore these
- preparations and proceed directly to the actual calculations. It will
- be easier to understand them once we grasp the entire flow.
-
-Concepts
---------
-
-- Just before we dive into the calculation flow, I want to describe a
- few concepts that are needed for later sections. However, you may
- skip this section and return later if you encounter any confusing
- terminologies.
-- **Sequence**: A sequence represents a client request. For example,
- the data pointed to by ``q`` has a shape of
- ``[num_seqs, num_heads, head_size]``. That represents there are total
- ``num_seqs`` of query sequence data are pointed by ``q``. Since this
- kernel is a single query attention kernel, each sequence only has one
- query token. Hence, the ``num_seqs`` equals the total number of tokens
- that are processed in the batch.
-- **Context**: The context consists of the generated tokens from the
- sequence. For instance, ``["What", "is", "your"]`` are the context
- tokens, and the input query token is ``"name"``. The model might
- generate the token ``"?"``.
-- **Vec**: The vec is a list of elements that are fetched and
- calculated together. For query and key data, the vec size
- (``VEC_SIZE``) is determined so that each thread group can fetch and
- calculate 16 bytes of data at a time. For value data, the vec size
- (``V_VEC_SIZE``) is determined so that each thread can fetch and
- calculate 16 bytes of data at a time. For example, if the
- ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the
- ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8.
-- **Thread group**: The thread group is a small group of
- threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one
- query token and one key token at a time. Each thread handles only a
- portion of the token data. The total number of elements processed by
- one thread group is referred as ``x``. For example, if the thread
- group contains 2 threads and the head size is 8, then thread 0
- handles the query and key elements at index 0, 2, 4, 6, while thread
- 1 handles the elements at index 1, 3, 5, 7.
-- **Block**: The key and value cache data in vLLM are split into
- blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``)
- of tokens at one head. Each block may contain only a portion of the
- whole context tokens. For example, if the block size is 16 and the
- head size is 128, then for one head, one block can store 16 \* 128 =
- 2048 elements.
-- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that
- execute simultaneously on a stream multiprocessor (SM). In this
- kernel, each warp processes the calculation between one query token
- and key tokens of one entire block at a time (it may process multiple
- blocks in multiple iterations). For example, if there are 4 warps and
- 6 blocks for one context, the assignment would be like warp 0 handles
- the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
- handles the 2nd block and warp 3 handles the 3rd block.
-- **Thread block**: A thread block is a group of
- threads(\ ``NUM_THREADS``) that can access the same shared memory.
- Each thread block contains multiple warps(\ ``NUM_WARPS``), and in
- this kernel, each thread block processes the calculation between one
- query token and key tokens of a whole context.
-- **Grid**: A grid is a collection of thread blocks and defines the
- shape of the collection. In this kernel, the shape is
- ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread
- block only handles the calculation for one head, one sequence, and
- one partition.
-
-Query
------
-
-- This section will introduce how query data is stored in memory and
- fetched by each thread. As mentioned above, each thread group fetches
- one query token data, while each thread itself only handles a part of
- one query token data. Within each warp, every thread group will fetch
- the same query token data, but will multiply it with different key
- token data.
-
- .. code:: cpp
-
- const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-
- .. figure:: ../../assets/kernel/query.png
- :alt: query
- :width: 70%
- :align: center
-
- Query data of one token at one head
-
-- Each thread defines its own ``q_ptr`` which points to the assigned
- query token data on global memory. For example, if ``VEC_SIZE`` is 4
- and ``HEAD_SIZE`` is 128, the ``q_ptr`` points to data that contains
- total of 128 elements divided into 128 / 4 = 32 vecs.
-
- .. figure:: ../../assets/kernel/q_vecs.png
- :alt: q_vecs
- :width: 70%
- :align: center
-
- ``q_vecs`` for one thread group
-
- .. code:: cpp
-
- __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
-
-- Next, we need to read the global memory data pointed to by ``q_ptr``
- into shared memory as ``q_vecs``. It is important to note that each
- vecs is assigned to a different row. For example, if the
- ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs,
- while thread 1 handles the 1st row vecs. By reading the query data in
- this way, neighboring threads like thread 0 and thread 1 can read
- neighbor memory, achieving the memory coalescing to improve
- performance.
-
-Key
----
-
-- Similar to the "Query" section, this section introduces memory layout
- and assignment for keys. While each thread group only handle one
- query token one kernel run, it may handle multiple key tokens across
- multiple iterations. Meanwhile, each warp will process multiple blocks
- of key tokens in multiple iterations, ensuring that all context
- tokens are processed by the entire thread group after the kernel run.
- In this context, "handle" refers to performing the dot multiplication
- between query data and key data.
-
- .. code:: cpp
-
- const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
- + kv_head_idx * kv_head_stride
- + physical_block_offset * x;
-
-- Unlike to ``q_ptr``, ``k_ptr`` in each thread will point to different
- key token at different iterations. As shown above, that ``k_ptr``
- points to key token data based on ``k_cache`` at assigned block,
- assigned head and assigned token.
-
- .. figure:: ../../assets/kernel/key.png
- :alt: key
- :width: 70%
- :align: center
-
- Key data of all context tokens at one head
-
-- The diagram above illustrates the memory layout for key data. It
- assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is
- 8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each
- rectangle represents all the elements for one key token at one head,
- which will be processed by one thread group. The left half shows the
- total 16 blocks of key token data for warp 0, while the right half
- represents the remaining key token data for other warps or
- iterations. Inside each rectangle, there are a total 32 vecs (128
- elements for one token) that will be processed by 2 threads (one
- thread group) separately.
-
- .. figure:: ../../assets/kernel/k_vecs.png
- :alt: k_vecs
- :width: 70%
- :align: center
-
- ``k_vecs`` for one thread
-
- .. code:: cpp
-
- K_vec k_vecs[NUM_VECS_PER_THREAD]
-
-- Next, we need to read the key token data from ``k_ptr`` and store
- them on register memory as ``k_vecs``. We use register memory for
- ``k_vecs`` because it will only be accessed by one thread once,
- whereas ``q_vecs`` will be accessed by multiple threads multiple
- times. Each ``k_vecs`` will contain multiple vectors for later
- calculation. Each vec will be set at each inner iteration. The
- assignment of vecs allows neighboring threads in a warp to read
- neighboring memory together, which again promotes the memory
- coalescing. For instance, thread 0 will read vec 0, while thread 1
- will read vec 1. In the next inner loop, thread 0 will read vec 2,
- while thread 1 will read vec 3, and so on.
-- You may still be a little confused about the overall flow. Don't
- worry, please keep reading the next "QK" section. It will illustrate
- the query and key calculation flow in a clearer and higher-level
- manner.
-
-QK
----
-
-- As shown the pseudo code below, before the entire for loop block, we
- fetch the query data for one token and store it in ``q_vecs``. Then,
- in the outer for loop, we iterate through different ``k_ptrs`` that
- point to different tokens and prepare the ``k_vecs`` in the inner for
- loop. Finally, we perform the dot multiplication between the
- ``q_vecs`` and each ``k_vecs``.
-
- .. code:: cpp
-
- q_vecs = ...
- for ... {
- k_ptr = ...
- for ... {
- k_vecs[i] = ...
- }
- ...
- float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
- }
-
-- As mentioned before, for each thread, it only fetches part of the
- query and key token data at a time. However, there will be a cross
- thread group reduction happen in the ``Qk_dot<>::dot`` . So ``qk``
- returned here is not just between part of the query and key token dot
- multiplication, but actually a full result between entire query and
- key token data.
-- For example, if the value of ``HEAD_SIZE`` is 128 and
- ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain
- total 64 elements. However, the returned ``qk`` is actually the
- result of dot multiplication between 128 query elements and 128 key
- elements. If you want to learn more about the details of the dot
- multiplication and reduction, you may refer to the implementation of
- ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not
- cover it in this document.
-
-Softmax
--------
-
-- Next, we need to calculate the normalized softmax for all ``qk``\ s,
- as shown above, where each :math:`x` represents a ``qk``. To do this,
- we must obtain the reduced value of ``qk_max``\ (:math:`m(x)`) and
- the ``exp_sum``\ (:math:`\ell(x)`) of all ``qk``\ s. The reduction
- should be performed across the entire thread block, encompassing
- results between the query token and all context key tokens.
-
- .. math::
- :nowrap:
-
- \begin{gather*}
- m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
- \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
- \end{gather*}
-
-``qk_max`` and ``logits``
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- Just right after we get the ``qk`` result, we can set the temporary
- ``logits`` result with ``qk`` (In the end, the ``logits`` should
- store the normalized softmax result). Also we can compare and collect
- the ``qk_max`` for all ``qk``\ s that are calculated by current
- thread group.
-
- .. code:: cpp
-
- if (thread_group_offset == 0) {
- const bool mask = token_idx >= context_len;
- logits[token_idx - start_token_idx] = mask ? 0.f : qk;
- qk_max = mask ? qk_max : fmaxf(qk_max, qk);
- }
-
-- Please note that the ``logits`` here is on shared memory, so each
- thread group will set the fields for its own assigned context tokens.
- Overall, the size of logits should be number of context tokens.
-
- .. code:: cpp
-
- for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
- qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
- }
-
- if (lane == 0) {
- red_smem[warp_idx] = qk_max;
- }
-
-- Then we need to get the reduced ``qk_max`` across each warp. The main
- idea is to make threads in warp to communicate with each other and
- get the final max ``qk`` .
-
- .. code:: cpp
-
- for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
- qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
- }
- qk_max = VLLM_SHFL_SYNC(qk_max, 0);
-
-- Finally, we can get the reduced ``qk_max`` from whole thread block by
- compare the ``qk_max`` from all warps in this thread block. Then we
- need to broadcast the final result to each thread.
-
-``exp_sum``
-~~~~~~~~~~~
-
-- Similar to ``qk_max``, we need to get the reduced sum value from the
- entire thread block too.
-
- .. code:: cpp
-
- for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
- float val = __expf(logits[i] - qk_max);
- logits[i] = val;
- exp_sum += val;
- }
- ...
- exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum);
-
-- Firstly, sum all exp values from each thread group, and meanwhile,
- convert each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``.
- Please note, the ``qk_max`` here is already the max ``qk`` across the
- whole thread block. And then we can do reduction for ``exp_sum``
- across whole thread block just like the ``qk_max``.
-
- .. code:: cpp
-
- const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
- for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
- logits[i] *= inv_sum;
- }
-
-- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain
- the final normalized softmax result as ``logits``. This ``logits``
- variable will be used for dot multiplication with the value data in
- later steps. Now, it should store the normalized softmax result of
- ``qk`` for all assigned context tokens.
-
-Value
------
-
-.. figure:: ../../assets/kernel/value.png
- :alt: value
- :width: 70%
- :align: center
-
- Value data of all context tokens at one head
-
-.. figure:: ../../assets/kernel/logits_vec.png
- :alt: logits_vec
- :width: 50%
- :align: center
-
- ``logits_vec`` for one thread
-
-.. figure:: ../../assets/kernel/v_vec.png
- :alt: v_vec
- :width: 70%
- :align: center
-
- List of ``v_vec`` for one thread
-
-- Now we need to retrieve the value data and perform dot multiplication
- with ``logits``. Unlike query and key, there is no thread group
- concept for value data. As shown in diagram, different from key token
- memory layout, elements from the same column correspond to the same
- value token. For one block of value data, there are ``HEAD_SIZE`` of
- rows and ``BLOCK_SIZE`` of columns that are split into multiple
- ``v_vecs``.
-- Each thread always fetches ``V_VEC_SIZE`` elements from the same
- ``V_VEC_SIZE`` of tokens at a time. As a result, a single thread
- retrieves multiple ``v_vec``\ s from different rows and the same
- columns through multiple inner iterations. For each ``v_vec``, it
- needs to be dot multiplied with the corresponding ``logits_vec``,
- which is also ``V_VEC_SIZE`` elements from ``logits``. Overall, with
- multiple inner iterations, each warp will process one block of value
- tokens. And with multiple outer iterations, the whole context value
- tokens are processd
-
- .. code:: cpp
-
- float accs[NUM_ROWS_PER_THREAD];
- for ... { // Iteration over different blocks.
- logits_vec = ...
- for ... { // Iteration over different rows.
- v_vec = ...
- ...
- accs[i] += dot(logits_vec, v_vec);
- }
- }
-
-- As shown in the above pseudo code, in the outer loop, similar to
- ``k_ptr``, ``logits_vec`` iterates over different blocks and reads
- ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each
- thread reads ``V_VEC_SIZE`` elements from the same tokens as a
- ``v_vec`` and performs dot multiplication. It is important to note
- that in each inner iteration, the thread fetches different head
- position elements for the same tokens. The dot result is then
- accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped
- to a head position assigned to the current thread.
-- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each
- thread fetches 8 value elements for 8 tokens at a time. Each element
- is from different tokens at the same head position. If ``HEAD_SIZE``
- is 128 and ``WARP_SIZE`` is 32, for each inner loop, a warp needs to
- fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This means there are
- a total of 128 \* 16 / 256 = 8 inner iterations for a warp to handle
- a whole block of value tokens. And each ``accs`` in each thread
- contains 8 elements that accumulated at 8 different head positions.
- For the thread 0, the ``accs`` variable will have 8 elements, which
- are 0th, 32th … 224th elements of a value head that are accumulated
- from all assigned 8 tokens.
-
-LV
----
-- Now, we need to perform reduction for ``accs`` within each warp. This
- process allows each thread to accumulate the ``accs`` for the
- assigned head positions of all tokens in one block.
-
- .. code:: cpp
-
- for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
- float acc = accs[i];
- for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
- acc += VLLM_SHFL_XOR_SYNC(acc, mask);
- }
- accs[i] = acc;
- }
-
-- Next, we perform reduction for ``accs`` across all warps, allowing
- each thread to have the accumulation of ``accs`` for the assigned
- head positions of all context tokens. Please note that each ``accs``
- in every thread only stores the accumulation for a portion of
- elements of the entire head for all context tokens. However, overall,
- all results for output have been calculated but are just stored in
- different thread register memory.
-
- .. code:: cpp
-
- float* out_smem = reinterpret_cast(shared_mem);
- for (int i = NUM_WARPS; i > 1; i /= 2) {
- // Upper warps write to shared memory.
- ...
- float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
- for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
- ...
- dst[row_idx] = accs[i];
- }
-
- // Lower warps update the output.
- const float* src = &out_smem[warp_idx * HEAD_SIZE];
- for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
- ...
- accs[i] += src[row_idx];
- }
-
- // Write out the accs.
- }
-
-Output
-------
-
-- Now we can write all of calculated result from local register memory
- to final output global memory.
-
- .. code:: cpp
-
- scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE
- + partition_idx * HEAD_SIZE;
-
-- First, we need to define the ``out_ptr`` variable, which points to
- the start address of the assigned sequence and assigned head.
-
- .. code:: cpp
-
- for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
- const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
- if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
- from_float(*(out_ptr + row_idx), accs[i]);
- }
- }
-
-- Finally, we need to iterate over different assigned head positions
- and write out the corresponding accumulated result based on the
- ``out_ptr``.
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md
new file mode 100644
index 0000000000000..bcccd284879bb
--- /dev/null
+++ b/docs/source/design/multimodal/adding_multimodal_plugin.md
@@ -0,0 +1,16 @@
+(adding-multimodal-plugin)=
+
+# Adding a Multimodal Plugin
+
+This document teaches you how to add a new modality to vLLM.
+
+Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`.
+
+The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s.
+
+```{note}
+This article is a work in progress.
+```
+
+% TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.rst b/docs/source/design/multimodal/adding_multimodal_plugin.rst
deleted file mode 100644
index b726138f840a3..0000000000000
--- a/docs/source/design/multimodal/adding_multimodal_plugin.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. _adding_multimodal_plugin:
-
-Adding a Multimodal Plugin
-==========================
-
-This document teaches you how to add a new modality to vLLM.
-
-Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
-
-The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
-
-.. note::
- This article is a work in progress.
-
-..
- TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.md
similarity index 61%
rename from docs/source/design/multimodal/multimodal_index.rst
rename to docs/source/design/multimodal/multimodal_index.md
index c6d47f90b62d5..88af07afc7018 100644
--- a/docs/source/design/multimodal/multimodal_index.rst
+++ b/docs/source/design/multimodal/multimodal_index.md
@@ -1,66 +1,83 @@
-.. _multi_modality:
+(multi-modality)=
-Multi-Modality
-==============
+# Multi-Modality
+```{eval-rst}
.. currentmodule:: vllm.multimodal
-
-vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
+```
-Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models `
-via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`.
+vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
+
+Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
+via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
-by following :ref:`this guide `.
+by following [this guide](#adding-multimodal-plugin).
-Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `.
+Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
-Guides
-++++++
+## Guides
-.. toctree::
- :maxdepth: 1
+```{toctree}
+:maxdepth: 1
- adding_multimodal_plugin
+adding_multimodal_plugin
+```
-Module Contents
-+++++++++++++++
+## Module Contents
+```{eval-rst}
.. automodule:: vllm.multimodal
+```
-Registry
---------
+### Registry
+```{eval-rst}
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
+```
+```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalRegistry
:members:
:show-inheritance:
+```
-Base Classes
-------------
+### Base Classes
+```{eval-rst}
.. autodata:: vllm.multimodal.NestedTensors
+```
+```{eval-rst}
.. autodata:: vllm.multimodal.BatchedTensorInputs
+```
+```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
:members:
:show-inheritance:
+```
+```{eval-rst}
.. autodata:: vllm.multimodal.MultiModalDataDict
+```
+```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalKwargs
:members:
:show-inheritance:
+```
+```{eval-rst}
.. autoclass:: vllm.multimodal.MultiModalPlugin
:members:
:show-inheritance:
+```
-Image Classes
--------------
+### Image Classes
+```{eval-rst}
.. automodule:: vllm.multimodal.image
:members:
:show-inheritance:
+```
diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
index b58456ecc6da8..34564413b34f6 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@@ -2,13 +2,14 @@
## Debugging
-Please see the [Debugging
-Tips](https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing)
+Please see the [Debugging Tips](#debugging-python-multiprocessing)
page for information on known issues and how to solve them.
## Introduction
-*Note that source code references are to the state of the code at the time of writing in December, 2024.*
+```{important}
+The source code references are to the state of the code at the time of writing in December, 2024.
+```
The use of Python multiprocessing in vLLM is complicated by:
@@ -20,7 +21,7 @@ This document describes how vLLM deals with these challenges.
## Multiprocessing Methods
-[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
+[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
- `spawn` - spawn a new Python process. This will be the default as of Python
3.14.
@@ -82,7 +83,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
Related PRs:
--
+-
## Prior State in v1
@@ -96,7 +97,7 @@ engine core.
-
-
-- https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45
+-
It was off by default for all the reasons mentioned above - compatibility with
dependencies and code using vLLM as a library.
@@ -119,17 +120,17 @@ instruct users to either add a `__main__` guard or to disable multiprocessing.
If that known-failure case occurs, the user will see two messages that explain
what is happening. First, a log message from vLLM:
-```
- WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
- initialized. We must use the `spawn` multiprocessing start method. Setting
- VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
- https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
- for more information.
+```console
+WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
+ initialized. We must use the `spawn` multiprocessing start method. Setting
+ VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
+ https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+ for more information.
```
Second, Python itself will raise an exception with a nice explanation:
-```
+```console
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md
new file mode 100644
index 0000000000000..79aff757518f2
--- /dev/null
+++ b/docs/source/design/plugin_system.md
@@ -0,0 +1,54 @@
+(plugin-system)=
+
+# vLLM's Plugin System
+
+The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.
+
+## How Plugins Work in vLLM
+
+Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
+
+## How vLLM Discovers Plugins
+
+vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
+
+```python
+# inside `setup.py` file
+from setuptools import setup
+
+setup(name='vllm_add_dummy_model',
+ version='0.1',
+ packages=['vllm_add_dummy_model'],
+ entry_points={
+ 'vllm.general_plugins':
+ ["register_dummy_model = vllm_add_dummy_model:register"]
+ })
+
+# inside `vllm_add_dummy_model.py` file
+def register():
+ from vllm import ModelRegistry
+
+ if "MyLlava" not in ModelRegistry.get_supported_archs():
+ ModelRegistry.register_model("MyLlava",
+ "vllm_add_dummy_model.my_llava:MyLlava")
+```
+
+For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
+
+Every plugin has three parts:
+
+1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins.
+2. **Plugin name**: The name of the plugin. This is the value in the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
+3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
+
+## What Can Plugins Do?
+
+Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
+
+## Guidelines for Writing Plugins
+
+- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
+
+## Compatibility Guarantee
+
+vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development.
diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst
deleted file mode 100644
index 5a96cc8b3a464..0000000000000
--- a/docs/source/design/plugin_system.rst
+++ /dev/null
@@ -1,62 +0,0 @@
-.. _plugin_system:
-
-vLLM's Plugin System
-====================
-
-The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.
-
-How Plugins Work in vLLM
-------------------------
-
-Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work.
-
-How vLLM Discovers Plugins
---------------------------
-
-vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
-
-.. code-block:: python
-
- # inside `setup.py` file
- from setuptools import setup
-
- setup(name='vllm_add_dummy_model',
- version='0.1',
- packages=['vllm_add_dummy_model'],
- entry_points={
- 'vllm.general_plugins':
- ["register_dummy_model = vllm_add_dummy_model:register"]
- })
-
- # inside `vllm_add_dummy_model.py` file
- def register():
- from vllm import ModelRegistry
-
- if "MyLlava" not in ModelRegistry.get_supported_archs():
- ModelRegistry.register_model("MyLlava",
- "vllm_add_dummy_model.my_llava:MyLlava")
-
-For more information on adding entry points to your package, please check the `official documentation `__.
-
-Every plugin has three parts:
-
-1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins.
-
-2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name.
-
-3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module.
-
-What Can Plugins Do?
---------------------
-
-Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
-
-Guidelines for Writing Plugins
-------------------------------
-
-- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
-
-Compatibility Guarantee
------------------------
-
-vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development.
diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.md
similarity index 59%
rename from docs/source/dev/engine/async_llm_engine.rst
rename to docs/source/dev/engine/async_llm_engine.md
index 93fc310cb543b..904feaa505164 100644
--- a/docs/source/dev/engine/async_llm_engine.rst
+++ b/docs/source/dev/engine/async_llm_engine.md
@@ -1,6 +1,7 @@
-AsyncLLMEngine
-=================================
+# AsyncLLMEngine
+```{eval-rst}
.. autoclass:: vllm.AsyncLLMEngine
:members:
:show-inheritance:
+```
diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/dev/engine/engine_index.md
new file mode 100644
index 0000000000000..701cb95d3be33
--- /dev/null
+++ b/docs/source/dev/engine/engine_index.md
@@ -0,0 +1,17 @@
+# vLLM Engine
+
+```{eval-rst}
+.. automodule:: vllm.engine
+```
+
+```{eval-rst}
+.. currentmodule:: vllm.engine
+```
+
+```{toctree}
+:caption: Engines
+:maxdepth: 2
+
+llm_engine
+async_llm_engine
+```
diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst
deleted file mode 100644
index ba9ae55ddea46..0000000000000
--- a/docs/source/dev/engine/engine_index.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-vLLM Engine
-=================================
-
-.. automodule:: vllm.engine
-.. currentmodule:: vllm.engine
-
-.. toctree::
- :maxdepth: 2
- :caption: Engines
-
- llm_engine
- async_llm_engine
-
diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.md
similarity index 60%
rename from docs/source/dev/engine/llm_engine.rst
rename to docs/source/dev/engine/llm_engine.md
index 0b8c1e219d7c9..d6613ef5562dc 100644
--- a/docs/source/dev/engine/llm_engine.rst
+++ b/docs/source/dev/engine/llm_engine.md
@@ -1,6 +1,7 @@
-LLMEngine
-=================================
+# LLMEngine
+```{eval-rst}
.. autoclass:: vllm.LLMEngine
:members:
:show-inheritance:
+```
diff --git a/docs/source/dev/offline_inference/llm.rst b/docs/source/dev/offline_inference/llm.md
similarity index 67%
rename from docs/source/dev/offline_inference/llm.rst
rename to docs/source/dev/offline_inference/llm.md
index 83ba1b6987c6d..9f129d5e41686 100644
--- a/docs/source/dev/offline_inference/llm.rst
+++ b/docs/source/dev/offline_inference/llm.md
@@ -1,6 +1,7 @@
-LLM Class
-=========
+# LLM Class
+```{eval-rst}
.. autoclass:: vllm.LLM
:members:
:show-inheritance:
+```
diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.md
similarity index 78%
rename from docs/source/dev/offline_inference/llm_inputs.rst
rename to docs/source/dev/offline_inference/llm_inputs.md
index 0d47281db485e..21f688a12c536 100644
--- a/docs/source/dev/offline_inference/llm_inputs.rst
+++ b/docs/source/dev/offline_inference/llm_inputs.md
@@ -1,14 +1,19 @@
-LLM Inputs
-==========
+# LLM Inputs
+```{eval-rst}
.. autodata:: vllm.inputs.PromptType
+```
+```{eval-rst}
.. autoclass:: vllm.inputs.TextPrompt
:show-inheritance:
:members:
:member-order: bysource
+```
+```{eval-rst}
.. autoclass:: vllm.inputs.TokensPrompt
:show-inheritance:
:members:
:member-order: bysource
+```
diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md
new file mode 100644
index 0000000000000..318a02d8c78df
--- /dev/null
+++ b/docs/source/dev/offline_inference/offline_index.md
@@ -0,0 +1,8 @@
+# Offline Inference
+
+```{toctree}
+:maxdepth: 1
+
+llm
+llm_inputs
+```
diff --git a/docs/source/dev/offline_inference/offline_index.rst b/docs/source/dev/offline_inference/offline_index.rst
deleted file mode 100644
index 27dfb0e9df90e..0000000000000
--- a/docs/source/dev/offline_inference/offline_index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Offline Inference
-=================================
-
-.. toctree::
- :maxdepth: 1
-
- llm
- llm_inputs
diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.md
similarity index 55%
rename from docs/source/dev/pooling_params.rst
rename to docs/source/dev/pooling_params.md
index 334e0287aff09..74b2c57443e4b 100644
--- a/docs/source/dev/pooling_params.rst
+++ b/docs/source/dev/pooling_params.md
@@ -1,5 +1,6 @@
-Pooling Parameters
-==================
+# Pooling Parameters
+```{eval-rst}
.. autoclass:: vllm.PoolingParams
:members:
+```
diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.md
similarity index 55%
rename from docs/source/dev/sampling_params.rst
rename to docs/source/dev/sampling_params.md
index f645941a6c022..bdc36af5153db 100644
--- a/docs/source/dev/sampling_params.rst
+++ b/docs/source/dev/sampling_params.md
@@ -1,5 +1,6 @@
-Sampling Parameters
-===================
+# Sampling Parameters
+```{eval-rst}
.. autoclass:: vllm.SamplingParams
:members:
+```
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index 79b49a186236a..aef32f7559f74 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -15,18 +15,12 @@ def fix_case(text: str) -> str:
return text
-def underline(title: str, character: str = "=") -> str:
- return f"{title}\n{character * len(title)}"
-
-
def generate_title(filename: str) -> str:
# Turn filename into a title
title = filename.replace("_", " ").title()
# Handle acronyms and names
title = fix_case(title)
- # Underline title
- title = underline(title)
- return title
+ return f"# {title}"
def generate_examples():
@@ -38,24 +32,23 @@ def generate_examples():
# Destination paths
doc_dir = root_dir / "docs/source/getting_started/examples"
- doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths]
+ doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths]
# Generate the example docs for each example script
for script_path, doc_path in zip(script_paths, doc_paths):
- script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}"
# Make script_path relative to doc_path and call it include_path
include_path = '../../../..' / script_path.relative_to(root_dir)
content = (f"{generate_title(doc_path.stem)}\n\n"
- f"Source {script_url}.\n\n"
- f".. literalinclude:: {include_path}\n"
- " :language: python\n"
- " :linenos:\n")
+                   f"Source: <gh-file:examples/{script_path.name}>.\n\n"
+ f"```{{literalinclude}} {include_path}\n"
+ ":language: python\n"
+ ":linenos:\n```")
with open(doc_path, "w+") as f:
f.write(content)
# Generate the toctree for the example scripts
- with open(doc_dir / "examples_index.template.rst") as f:
+ with open(doc_dir / "examples_index.template.md") as f:
examples_index = f.read()
- with open(doc_dir / "examples_index.rst", "w+") as f:
- example_docs = "\n ".join(path.stem for path in script_paths)
+ with open(doc_dir / "examples_index.md", "w+") as f:
+ example_docs = "\n".join(path.stem + ".md" for path in script_paths)
f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md
new file mode 100644
index 0000000000000..6d01efbbf8828
--- /dev/null
+++ b/docs/source/getting_started/amd-installation.md
@@ -0,0 +1,163 @@
+(installation-rocm)=
+
+# Installation with ROCm
+
+vLLM supports AMD GPUs with ROCm 6.2.
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
+- ROCm 6.2
+
+Installation options:
+
+1. [Build from source with docker](#build-from-source-docker-rocm)
+2. [Build from source](#build-from-source-rocm)
+
+(build-from-source-docker-rocm)=
+
+## Option 1: Build from source with docker (recommended)
+
+You can build and install vLLM from source.
+
+First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
+It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
+
+```console
+{
+ "features": {
+ "buildkit": true
+ }
+}
+```
+
+<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
+It provides flexibility to customize the build of docker image using the following arguments:
+
+- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
+- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
+- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
+- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
+- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+
+Their values can be passed in when running `docker build` with `--build-arg` options.
+
+To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
+
+```console
+$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+
+```console
+$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To run the above docker image `vllm-rocm`, use the below command:
+
+```console
+$ docker run -it \
+ --network=host \
+ --group-add=video \
+ --ipc=host \
+ --cap-add=SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --device /dev/kfd \
+ --device /dev/dri \
+ -v <path/to/model>:/app/model \
+ vllm-rocm \
+ bash
+```
+
+Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
+
+(build-from-source-rocm)=
+
+## Option 2: Build from source
+
+0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
+
+- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
+- [PyTorch](https://pytorch.org/)
+
+For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
+
+Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/)
+
+1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
+
+Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
+
+```console
+$ python3 -m pip install ninja cmake wheel pybind11
+$ pip uninstall -y triton
+$ git clone https://github.com/OpenAI/triton.git
+$ cd triton
+$ git checkout e192dba
+$ cd python
+$ pip3 install .
+$ cd ../..
+```
+
+```{note}
+- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
+```
+
+2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
+
+Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support)
+Alternatively, wheels intended for vLLM use can be accessed under the releases.
+
+For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
+
+```console
+$ git clone https://github.com/ROCm/flash-attention.git
+$ cd flash-attention
+$ git checkout 3cea2fb
+$ git submodule update --init
+$ GPU_ARCHS="gfx90a" python3 setup.py install
+$ cd ..
+```
+
+```{note}
+- You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+```
+
+3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps:
+
+```bash
+$ pip install --upgrade pip
+
+# Install PyTorch
+$ pip uninstall torch -y
+$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
+
+# Build & install AMD SMI
+$ pip install /opt/rocm/share/amd_smi
+
+# Install dependencies
+$ pip install --upgrade numba scipy huggingface-hub[cli]
+$ pip install "numpy<2"
+$ pip install -r requirements-rocm.txt
+
+# Build vLLM for MI210/MI250/MI300.
+$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+$ python3 setup.py develop
+```
+
+This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
+
+```{tip}
+- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
+- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
+```
+
+```{tip}
+- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+ For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+```
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
deleted file mode 100644
index ece5d785e0c65..0000000000000
--- a/docs/source/getting_started/amd-installation.rst
+++ /dev/null
@@ -1,178 +0,0 @@
-.. _installation_rocm:
-
-Installation with ROCm
-======================
-
-vLLM supports AMD GPUs with ROCm 6.2.
-
-Requirements
-------------
-
-* OS: Linux
-* Python: 3.9 -- 3.12
-* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
-* ROCm 6.2
-
-Installation options:
-
-#. :ref:`Build from source with docker `
-#. :ref:`Build from source `
-
-.. _build_from_source_docker_rocm:
-
-Option 1: Build from source with docker (recommended)
------------------------------------------------------
-
-You can build and install vLLM from source.
-
-First, build a docker image from `Dockerfile.rocm `_ and launch a docker container from the image.
-It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
-
-.. code-block:: console
-
- {
- "features": {
- "buildkit": true
- }
- }
-
-
-`Dockerfile.rocm `_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
-It provides flexibility to customize the build of docker image using the following arguments:
-
-* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
-* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target.
-* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
-* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo `_. The default is `ae7928c`
-* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
-
-Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
-
-
-To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
-
-.. code-block:: console
-
- $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
-
-To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
-
-.. code-block:: console
-
- $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
-
-To run the above docker image ``vllm-rocm``, use the below command:
-
-.. code-block:: console
-
- $ docker run -it \
- --network=host \
- --group-add=video \
- --ipc=host \
- --cap-add=SYS_PTRACE \
- --security-opt seccomp=unconfined \
- --device /dev/kfd \
- --device /dev/dri \
- -v :/app/model \
- vllm-rocm \
- bash
-
-Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
-
-
-.. _build_from_source_rocm:
-
-Option 2: Build from source
----------------------------
-
-0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
-
-- `ROCm `_
-- `PyTorch `_
-
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
-
-Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started `_
-
-
-1. Install `Triton flash attention for ROCm `_
-
-Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton `_
-
- .. code-block:: console
-
- $ python3 -m pip install ninja cmake wheel pybind11
- $ pip uninstall -y triton
- $ git clone https://github.com/OpenAI/triton.git
- $ cd triton
- $ git checkout e192dba
- $ cd python
- $ pip3 install .
- $ cd ../..
-
-.. note::
- - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
-
-
-2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm `_
-
-
-Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention `_
-Alternatively, wheels intended for vLLM use can be accessed under the releases.
-
-For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`.
-Note to get your gfx architecture, run `rocminfo |grep gfx`.
-
- .. code-block:: console
-
- $ git clone https://github.com/ROCm/flash-attention.git
- $ cd flash-attention
- $ git checkout 3cea2fb
- $ git submodule update --init
- $ GPU_ARCHS="gfx90a" python3 setup.py install
- $ cd ..
-
-.. note::
- - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
-
-3. Build vLLM.
-
- For example, vLLM on ROCM 6.2 can be built with the following steps:
-
- .. code-block:: console
-
- $ pip install --upgrade pip
-
- $ # Install PyTorch
- $ pip uninstall torch -y
- $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
-
- $ # Build & install AMD SMI
- $ pip install /opt/rocm/share/amd_smi
-
- $ # Install dependencies
- $ pip install --upgrade numba scipy huggingface-hub[cli]
- $ pip install "numpy<2"
- $ pip install -r requirements-rocm.txt
-
- $ # Build vLLM for MI210/MI250/MI300.
- $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
- $ python3 setup.py develop
-
-
- This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation.
-
-
-.. tip::
-
- - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
- - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
- - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention.
- - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
-
-
-.. tip::
- - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level.
- For vLLM, please refer to `vLLM performance optimization `_.
-
-
diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md
new file mode 100644
index 0000000000000..de807e198b4f6
--- /dev/null
+++ b/docs/source/getting_started/arm-installation.md
@@ -0,0 +1,46 @@
+(installation-arm)=
+
+# Installation for ARM CPUs
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
+
+- CPU backend inference capabilities
+- Relevant runtime environment variables
+- Performance optimization tips
+
+ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
+Contents:
+
+1. [Requirements](#arm-backend-requirements)
+2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile)
+3. [Building from Source](#build-arm-backend-from-source)
+
+(arm-backend-requirements)=
+
+## Requirements
+
+- **Operating System**: Linux or macOS
+- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
+- **Instruction Set Architecture (ISA)**: NEON support is required
+
+(arm-backend-quick-start-dockerfile)=
+
+## Quick Start with Dockerfile
+
+You can quickly set up vLLM on ARM using Docker:
+
+```console
+$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
+$ docker run -it \
+ --rm \
+ --network=host \
+ --cpuset-cpus= \
+ --cpuset-mems= \
+ vllm-cpu-env
+```
+
+(build-arm-backend-from-source)=
+
+## Building from Source
+
+To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst
deleted file mode 100644
index 7b457df92c11d..0000000000000
--- a/docs/source/getting_started/arm-installation.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. _installation_arm:
-
-Installation for ARM CPUs
-=========================
-
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
-
-* CPU backend inference capabilities
-* Relevant runtime environment variables
-* Performance optimization tips
-
-ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
-Contents:
-
-1. :ref:`Requirements `
-2. :ref:`Quick Start with Dockerfile `
-3. :ref:`Building from Source `
-
-.. _arm_backend_requirements:
-
-Requirements
-------------
-
-* **Operating System**: Linux or macOS
-* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
-* **Instruction Set Architecture (ISA)**: NEON support is required
-
-.. _arm_backend_quick_start_dockerfile:
-
-Quick Start with Dockerfile
----------------------------
-
-You can quickly set up vLLM on ARM using Docker:
-
-.. code-block:: console
-
- $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
- $ docker run -it \
- --rm \
- --network=host \
- --cpuset-cpus= \
- --cpuset-mems= \
- vllm-cpu-env
-
-.. _build_arm_backend_from_source:
-
-Building from Source
---------------------
-
-To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md
new file mode 100644
index 0000000000000..b6f181ace6274
--- /dev/null
+++ b/docs/source/getting_started/cpu-installation.md
@@ -0,0 +1,154 @@
+(installation-cpu)=
+
+# Installation with CPU
+
+vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
+
+- Tensor Parallel
+- Model Quantization (`INT8 W8A8, AWQ`)
+- Chunked-prefill
+- Prefix-caching
+- FP8-E5M2 KV-Caching (TODO)
+
+Table of contents:
+
+1. [Requirements](#cpu-backend-requirements)
+2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile)
+3. [Build from source](#build-cpu-backend-from-source)
+4. [Related runtime environment variables](#env-intro)
+5. [Intel Extension for PyTorch](#ipex-guidance)
+6. [Performance tips](#cpu-backend-performance-tips)
+
+(cpu-backend-requirements)=
+
+## Requirements
+
+- OS: Linux
+- Compiler: gcc/g++>=12.3.0 (optional, recommended)
+- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
+
+(cpu-backend-quick-start-dockerfile)=
+
+## Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+$ docker run -it \
+ --rm \
+ --network=host \
+ --cpuset-cpus= \
+ --cpuset-mems= \
+ vllm-cpu-env
+```
+
+(build-cpu-backend-from-source)=
+
+## Build from source
+
+- First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
+
+```console
+$ sudo apt-get update -y
+$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+- Second, install Python packages for vLLM CPU backend building:
+
+```console
+$ pip install --upgrade pip
+$ pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
+$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+```
+
+- Finally, build and install vLLM CPU backend:
+
+```console
+$ VLLM_TARGET_DEVICE=cpu python setup.py install
+```
+
+```{note}
+- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
+```
+
+(env-intro)=
+
+## Related runtime environment variables
+
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+
+(ipex-guidance)=
+
+## Intel Extension for PyTorch
+
+- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware.
+
+(cpu-backend-performance-tips)=
+
+## Performance tips
+
+- We highly recommend using TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run:
+
+```console
+$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
+$ find / -name '*libtcmalloc*' # find the dynamic link library path
+$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
+$ python examples/offline_inference.py # run vLLM
+```
+
+- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
+
+```console
+$ export VLLM_CPU_KVCACHE_SPACE=40
+$ export VLLM_CPU_OMP_THREADS_BIND=0-29
+$ vllm serve facebook/opt-125m
+```
+
+- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+
+```console
+$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
+
+# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
+CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
+0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
+1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
+2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
+3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
+4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
+5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
+6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
+7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
+8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
+9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
+10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
+11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
+12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
+13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
+14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
+15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
+
+# On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15
+$ export VLLM_CPU_OMP_THREADS_BIND=0-7
+$ python examples/offline_inference.py
+```
+
+- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
+
+## CPU Backend Considerations
+
+- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance.
+
+- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
+
+- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel.
+
+ - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
+
+ ```console
+ $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+ ```
+
+ - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
deleted file mode 100644
index 649de1cd9b53c..0000000000000
--- a/docs/source/getting_started/cpu-installation.rst
+++ /dev/null
@@ -1,164 +0,0 @@
-.. _installation_cpu:
-
-Installation with CPU
-========================
-
-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
-
-- Tensor Parallel
-- Model Quantization (``INT8 W8A8, AWQ``)
-- Chunked-prefill
-- Prefix-caching
-- FP8-E5M2 KV-Caching (TODO)
-
-Table of contents:
-
-#. :ref:`Requirements `
-#. :ref:`Quick start using Dockerfile `
-#. :ref:`Build from source `
-#. :ref:`Related runtime environment variables `
-#. :ref:`Intel Extension for PyTorch `
-#. :ref:`Performance tips `
-
-.. _cpu_backend_requirements:
-
-Requirements
-------------
-
-* OS: Linux
-* Compiler: gcc/g++>=12.3.0 (optional, recommended)
-* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
-
-.. _cpu_backend_quick_start_dockerfile:
-
-Quick start using Dockerfile
-----------------------------
-
-.. code-block:: console
-
- $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
- $ docker run -it \
- --rm \
- --network=host \
- --cpuset-cpus= \
- --cpuset-mems= \
- vllm-cpu-env
-
-.. _build_cpu_backend_from_source:
-
-Build from source
------------------
-
-- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
-
-.. code-block:: console
-
- $ sudo apt-get update -y
- $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
- $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-- Second, install Python packages for vLLM CPU backend building:
-
-.. code-block:: console
-
- $ pip install --upgrade pip
- $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
- $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-
-- Finally, build and install vLLM CPU backend:
-
-.. code-block:: console
-
- $ VLLM_TARGET_DEVICE=cpu python setup.py install
-
-.. note::
- - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
-
- - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
-
-.. _env_intro:
-
-Related runtime environment variables
--------------------------------------
-
-- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-
-- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
-
-.. _ipex_guidance:
-
-Intel Extension for PyTorch
----------------------------
-
-- `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
-
-.. _cpu_backend_performance_tips:
-
-Performance tips
------------------
-
-- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
-
-.. code-block:: console
-
- $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
- $ find / -name *libtcmalloc* # find the dynamic link library path
- $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
- $ python examples/offline_inference.py # run vLLM
-
-- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
-
-.. code-block:: console
-
- $ export VLLM_CPU_KVCACHE_SPACE=40
- $ export VLLM_CPU_OMP_THREADS_BIND=0-29
- $ vllm serve facebook/opt-125m
-
-- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
-
-.. code-block:: console
-
- $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
-
- # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
- CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
- 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
- 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
- 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
- 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
- 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
- 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
- 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
- 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
- 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
- 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
- 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
- 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
- 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
- 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
- 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
- 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
-
- # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
- $ export VLLM_CPU_OMP_THREADS_BIND=0-7
- $ python examples/offline_inference.py
-
-- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access.
-
-CPU Backend Considerations
---------------------------
-
-- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance.
-
-- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
-
-- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology `_. For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.
-
- * Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
-
- .. code-block:: console
-
- $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
-
-
- * Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_.
\ No newline at end of file
diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md
new file mode 100644
index 0000000000000..3b0029f2e88ce
--- /dev/null
+++ b/docs/source/getting_started/debugging.md
@@ -0,0 +1,200 @@
+(debugging)=
+
+# Debugging Tips
+
+This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
+
+```{note}
+Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
+```
+
+## Hangs downloading a model
+
+If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection.
+It's recommended to download the model first using the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli) and passing the local path to the model to vLLM. This way, you can isolate the issue.
+
+## Hangs loading a model from disk
+
+If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
+It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
+
+```{note}
+To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
+```
+
+## Model is too large
+
+If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using the example script <gh-file:examples/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+
+## Enable more logging
+
+If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:
+
+- `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging.
+- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
+- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
+- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs.
+
+## Incorrect network setup
+
+The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and the IP address should be the correct one.
+If it's not, override the IP address using the environment variable `export VLLM_HOST_IP=`.
+
+You might also need to set `export NCCL_SOCKET_IFNAME=` and `export GLOO_SOCKET_IFNAME=` to specify the network interface for the IP address.
+
+## Error near `self.graph.replay()`
+
+If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
+To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
+
+## Incorrect hardware/driver
+
+If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
+
+```python
+# Test PyTorch NCCL
+import torch
+import torch.distributed as dist
+dist.init_process_group(backend="nccl")
+local_rank = dist.get_rank() % torch.cuda.device_count()
+torch.cuda.set_device(local_rank)
+data = torch.FloatTensor([1,] * 128).to("cuda")
+dist.all_reduce(data, op=dist.ReduceOp.SUM)
+torch.cuda.synchronize()
+value = data.mean().item()
+world_size = dist.get_world_size()
+assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("PyTorch NCCL is successful!")
+
+# Test PyTorch GLOO
+gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
+cpu_data = torch.FloatTensor([1,] * 128)
+dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
+value = cpu_data.mean().item()
+assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("PyTorch GLOO is successful!")
+
+if world_size <= 1:
+ exit()
+
+# Test vLLM NCCL, with cuda graph
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+
+pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
+# pynccl is enabled by default for 0.6.5+,
+# but for 0.6.4 and below, we need to enable it manually.
+# keep the code here for backward compatibility, because people
+# tend to read the latest documentation.
+pynccl.disabled = False
+
+s = torch.cuda.Stream()
+with torch.cuda.stream(s):
+ data.fill_(1)
+ pynccl.all_reduce(data, stream=s)
+ value = data.mean().item()
+ assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("vLLM NCCL is successful!")
+
+g = torch.cuda.CUDAGraph()
+with torch.cuda.graph(cuda_graph=g, stream=s):
+ pynccl.all_reduce(data, stream=torch.cuda.current_stream())
+
+data.fill_(1)
+g.replay()
+torch.cuda.current_stream().synchronize()
+value = data.mean().item()
+assert value == world_size, f"Expected {world_size}, got {value}"
+
+print("vLLM NCCL with cuda graph is successful!")
+
+dist.destroy_process_group(gloo_group)
+dist.destroy_process_group()
+```
+
+If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
+
+```console
+$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py
+```
+
+If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
+
+```console
+$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
+```
+
+If the script runs successfully, you should see messages like `PyTorch NCCL is successful!` and `vLLM NCCL with cuda graph is successful!`.
+
+If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
+
+```{note}
+A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
+
+- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
+- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
+
+Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
+```
+
+(debugging-python-multiprocessing)=
+## Python multiprocessing
+
+### `RuntimeError` Exception
+
+If you have seen a warning in your logs like this:
+
+```console
+WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
+ initialized. We must use the `spawn` multiprocessing start method. Setting
+ VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
+ https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+ for more information.
+```
+
+or an error from Python that looks like this:
+
+```console
+RuntimeError:
+ An attempt has been made to start a new process before the
+ current process has finished its bootstrapping phase.
+
+ This probably means that you are not using fork to start your
+ child processes and you have forgotten to use the proper idiom
+ in the main module:
+
+ if __name__ == '__main__':
+ freeze_support()
+ ...
+
+ The "freeze_support()" line can be omitted if the program
+ is not going to be frozen to produce an executable.
+
+ To fix this issue, refer to the "Safe importing of main module"
+ section in https://docs.python.org/3/library/multiprocessing.html
+```
+
+then you must update your Python code to guard usage of `vllm` behind a `if
+__name__ == '__main__':` block. For example, instead of this:
+
+```python
+import vllm
+
+llm = vllm.LLM(...)
+```
+
+try this instead:
+
+```python
+if __name__ == '__main__':
+ import vllm
+
+ llm = vllm.LLM(...)
+```
+
+## Known Issues
+
+- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000), which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
+- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234), all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656).
diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
deleted file mode 100644
index 7f36d65a227f0..0000000000000
--- a/docs/source/getting_started/debugging.rst
+++ /dev/null
@@ -1,202 +0,0 @@
-.. _debugging:
-
-===============
-Debugging Tips
-===============
-
-This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible.
-
-.. note::
-
- Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
-
-Hangs downloading a model
-----------------------------------------
-If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection.
-It's recommended to download the model first using the `huggingface-cli `_ and passing the local path to the model to vLLM. This way, you can isolate the issue.
-
-Hangs loading a model from disk
-----------------------------------------
-If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
-It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
-
-.. note::
-
- To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
-
-Model is too large
-----------------------------------------
-If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
-
-Enable more logging
-----------------------------------------
-If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:
-
-- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
-- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem.
-- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
-- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs.
-
-Incorrect network setup
-----------------------------------------
-The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one.
-If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``.
-
-You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address.
-
-Error near ``self.graph.replay()``
-----------------------------------------
-If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph.
-To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
-
-Incorrect hardware/driver
-----------------------------------------
-If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
-
-.. code-block:: python
-
- # Test PyTorch NCCL
- import torch
- import torch.distributed as dist
- dist.init_process_group(backend="nccl")
- local_rank = dist.get_rank() % torch.cuda.device_count()
- torch.cuda.set_device(local_rank)
- data = torch.FloatTensor([1,] * 128).to("cuda")
- dist.all_reduce(data, op=dist.ReduceOp.SUM)
- torch.cuda.synchronize()
- value = data.mean().item()
- world_size = dist.get_world_size()
- assert value == world_size, f"Expected {world_size}, got {value}"
-
- print("PyTorch NCCL is successful!")
-
- # Test PyTorch GLOO
- gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
- cpu_data = torch.FloatTensor([1,] * 128)
- dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
- value = cpu_data.mean().item()
- assert value == world_size, f"Expected {world_size}, got {value}"
-
- print("PyTorch GLOO is successful!")
-
- if world_size <= 1:
- exit()
-
- # Test vLLM NCCL, with cuda graph
- from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
-
- pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
- # pynccl is enabled by default for 0.6.5+,
- # but for 0.6.4 and below, we need to enable it manually.
- # keep the code for backward compatibility when because people
- # prefer to read the latest documentation.
- pynccl.disabled = False
-
- s = torch.cuda.Stream()
- with torch.cuda.stream(s):
- data.fill_(1)
- pynccl.all_reduce(data, stream=s)
- value = data.mean().item()
- assert value == world_size, f"Expected {world_size}, got {value}"
-
- print("vLLM NCCL is successful!")
-
- g = torch.cuda.CUDAGraph()
- with torch.cuda.graph(cuda_graph=g, stream=s):
- pynccl.all_reduce(data, stream=torch.cuda.current_stream())
-
- data.fill_(1)
- g.replay()
- torch.cuda.current_stream().synchronize()
- value = data.mean().item()
- assert value == world_size, f"Expected {world_size}, got {value}"
-
- print("vLLM NCCL with cuda graph is successful!")
-
- dist.destroy_process_group(gloo_group)
- dist.destroy_process_group()
-
-If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use:
-
-.. code-block:: console
-
- $ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py
-
-If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run:
-
-.. code-block:: console
-
- $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
-
-If the script runs successfully, you should see the message ``sanity check is successful!``.
-
-If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
-
-.. note::
-
- A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
-
- - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``.
- - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``.
-
- Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes.
-
-Python multiprocessing
-----------------------
-
-`RuntimeError` Exception
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you have seen a warning in your logs like this:
-
-.. code-block:: console
-
- WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
- initialized. We must use the `spawn` multiprocessing start method. Setting
- VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
- https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
- for more information.
-
-or an error from Python that looks like this:
-
-.. code-block:: console
-
- RuntimeError:
- An attempt has been made to start a new process before the
- current process has finished its bootstrapping phase.
-
- This probably means that you are not using fork to start your
- child processes and you have forgotten to use the proper idiom
- in the main module:
-
- if __name__ == '__main__':
- freeze_support()
- ...
-
- The "freeze_support()" line can be omitted if the program
- is not going to be frozen to produce an executable.
-
- To fix this issue, refer to the "Safe importing of main module"
- section in https://docs.python.org/3/library/multiprocessing.html
-
-then you must update your Python code to guard usage of ``vllm`` behind a ``if
-__name__ == '__main__':`` block. For example, instead of this:
-
-.. code-block:: python
-
- import vllm
-
- llm = vllm.LLM(...)
-
-try this instead:
-
-.. code-block:: python
-
- if __name__ == '__main__':
- import vllm
-
- llm = vllm.LLM(...)
-
-Known Issues
-----------------------------------------
-- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_.
diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md
new file mode 100644
index 0000000000000..de7a91c0ffa48
--- /dev/null
+++ b/docs/source/getting_started/examples/examples_index.template.md
@@ -0,0 +1,8 @@
+# Examples
+
+```{toctree}
+:maxdepth: 1
+:caption: Scripts
+
+%EXAMPLE_DOCS%
+```
\ No newline at end of file
diff --git a/docs/source/getting_started/examples/examples_index.template.rst b/docs/source/getting_started/examples/examples_index.template.rst
deleted file mode 100644
index 1b34cccbae15a..0000000000000
--- a/docs/source/getting_started/examples/examples_index.template.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Examples
-=================================
-
-.. toctree::
- :maxdepth: 1
- :caption: Scripts
-
- %EXAMPLE_DOCS%
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md
new file mode 100644
index 0000000000000..acf42f210dffb
--- /dev/null
+++ b/docs/source/getting_started/gaudi-installation.md
@@ -0,0 +1,386 @@
+# Installation with Intel® Gaudi® AI Accelerators
+
+This README provides instructions on running vLLM with Intel Gaudi devices.
+
+## Requirements and Installation
+
+Please follow the instructions provided in the [Gaudi Installation
+Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html)
+to set up the execution environment. To achieve the best performance,
+please follow the methods outlined in the [Optimizing Training Platform
+Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
+
+### Requirements
+
+- OS: Ubuntu 22.04 LTS
+- Python: 3.10
+- Intel Gaudi accelerator
+- Intel Gaudi software version 1.18.0
+
+### Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.hpu -t vllm-hpu-env .
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+```
+
+```{tip}
+If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
+```
+
+### Build from source
+
+#### Environment verification
+
+To verify that the Intel Gaudi software was correctly installed, run:
+
+```console
+$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+$ pip list | grep neural # verify that neural_compressor is installed
+```
+
+Refer to [Intel Gaudi Software Stack
+Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
+for more details.
+
+#### Run Docker Image
+
+It is highly recommended to use the latest Docker image from Intel Gaudi
+vault. Refer to the [Intel Gaudi
+documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers)
+for more details.
+
+Use the following commands to run a Docker image:
+
+```console
+$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+```
+
+#### Build and Install vLLM
+
+To build and install vLLM from source, run:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ python setup.py develop
+```
+
+Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
+
+```console
+$ git clone https://github.com/HabanaAI/vllm-fork.git
+$ cd vllm-fork
+$ git checkout habana_main
+$ python setup.py develop
+```
+
+## Supported Features
+
+- [Offline batched inference](#offline-batched-inference)
+- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
+- HPU autodetection - no need to manually select device within vLLM
+- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
+- Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
+ prefill attention, Root Mean Square Layer Normalization, Rotary
+ Positional Encoding
+- Tensor parallelism support for multi-card inference
+- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
+ for accelerating low-batch latency and throughput
+- Attention with Linear Biases (ALiBi)
+
+## Unsupported Features
+
+- Beam search
+- LoRA adapters
+- Quantization
+- Prefill chunking (mixed-batch inferencing)
+
+## Supported Configurations
+
+The following configurations have been validated to function with
+Gaudi2 devices. Configurations that are not listed may or may not work.
+
+- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b)
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b)
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B)
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+
+## Performance Tuning
+
+### Execution modes
+
+Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
+
+```{eval-rst}
+.. list-table:: vLLM execution modes
+ :widths: 25 25 50
+ :header-rows: 1
+
+ * - ``PT_HPU_LAZY_MODE``
+ - ``enforce_eager``
+ - execution mode
+ * - 0
+ - 0
+ - torch.compile
+ * - 0
+ - 1
+ - PyTorch eager mode
+ * - 1
+ - 0
+ - HPU Graphs
+ * - 1
+ - 1
+ - PyTorch lazy mode
+```
+
+```{warning}
+In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+```
+
+### Bucketing mechanism
+
+Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
+In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`.
+
+```{note}
+Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
+```
+
+Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
+
+```
+INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+```
+
+`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+
+Example (with ramp-up)
+
+```
+min = 2, step = 32, max = 64
+=> ramp_up = (2, 4, 8, 16)
+=> stable = (32, 64)
+=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
+```
+
+Example (without ramp-up)
+
+```
+min = 128, step = 128, max = 512
+=> ramp_up = ()
+=> stable = (128, 256, 384, 512)
+=> buckets = ramp_up + stable => (128, 256, 384, 512)
+```
+
+In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
+
+```{warning}
+If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
+```
+
+As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
+
+```{note}
+Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
+```
+
+### Warmup
+
+Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
+
+```
+INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
+INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
+INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
+...
+INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
+INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
+INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
+...
+INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
+INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+```
+
+This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
+
+```{tip}
+Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
+```
+
+### HPU Graph capture
+
+[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
+
+When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default).
+Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage.
+Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable.
+Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
+Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture.
+With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
+Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), 30% of usable graph memory is reserved for prefill graphs and the remaining 70% for decode graphs.
+Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
+
+```{note}
+`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
+```
+
+User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
+\- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode
+\- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt
+
+When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy.
+
+```{note}
+`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
+```
+
+Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
+
+```
+INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
+INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
+INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
+INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
+INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
+INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
+...
+INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
+INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+...
+INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
+INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
+...
+INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
+INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
+INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
+INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
+INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
+INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
+INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
+INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
+```
+
+### Recommended vLLM Parameters
+
+- We recommend running inference on Gaudi 2 with `block_size` of 128
+ for BF16 data type. Using default values (16, 32) might lead to
+ sub-optimal performance due to Matrix Multiplication Engine
+ under-utilization (see [Gaudi
+ Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)).
+- For max throughput on Llama 7B, we recommend running with batch size
+ of 128 or 256 and max context length of 2048 with HPU Graphs enabled.
+ If you encounter out-of-memory issues, see troubleshooting section.
+
+### Environment variables
+
+**Diagnostic and profiling knobs:**
+
+- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default.
+- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there were any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default.
+- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default.
+- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there were any. Disabled by default.
+- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default.
+
+**Performance tuning knobs:**
+
+- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default
+
+- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default
+
+- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default
+
+- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default
+
+- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default
+
+- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism
+
+ - `{phase}` is either `PROMPT` or `DECODE`
+
+ - `{dim}` is either `BS`, `SEQ` or `BLOCK`
+
+ - `{param}` is either `MIN`, `STEP` or `MAX`
+
+ - Default values:
+
+ - Prompt:
+ : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
+ - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+ - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
+ - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
+ - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
+ - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
+ - Decode:
+ : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
+ - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+ - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
+ - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
+ - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
+ - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
+
+Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
+
+- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default
+- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs
+
+## Troubleshooting: Tweaking HPU Graphs
+
+If you experience device out-of-memory issues or want to attempt
+inference at higher batch sizes, try tweaking HPU Graphs by following
+the steps below:
+
+- Tweak `gpu_memory_utilization` knob. It will decrease the
+ allocation of KV cache, leaving some headroom for capturing graphs
+ with larger batch size. By default `gpu_memory_utilization` is set
+ to 0.9. It attempts to allocate ~90% of HBM left for KV cache after
+  short profiling run. Note that decreasing it reduces the number of KV
+ cache blocks you have available, and therefore reduces the effective
+ maximum number of tokens you can handle at a given time.
+- If this method is not efficient, you can disable `HPUGraph`
+ completely. With HPU Graphs disabled, you are trading latency and
+ throughput at lower batches for potentially higher throughput on
+ higher batches. You can do that by adding `--enforce-eager` flag to
+ server (for online inference), or by passing `enforce_eager=True`
+ argument to LLM constructor (for offline inference).
diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst
deleted file mode 100644
index 249e08278ff8f..0000000000000
--- a/docs/source/getting_started/gaudi-installation.rst
+++ /dev/null
@@ -1,402 +0,0 @@
-Installation with Intel® Gaudi® AI Accelerators
-===============================================
-
-This README provides instructions on running vLLM with Intel Gaudi devices.
-
-Requirements and Installation
------------------------------
-
-Please follow the instructions provided in the `Gaudi Installation
-Guide `__
-to set up the execution environment. To achieve the best performance,
-please follow the methods outlined in the `Optimizing Training Platform
-Guide `__.
-
-Requirements
-~~~~~~~~~~~~
-
-- OS: Ubuntu 22.04 LTS
-- Python: 3.10
-- Intel Gaudi accelerator
-- Intel Gaudi software version 1.18.0
-
-
-Quick start using Dockerfile
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. code:: console
-
- $ docker build -f Dockerfile.hpu -t vllm-hpu-env .
- $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
-
-
-.. tip::
- If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered.
-
-
-Build from source
-~~~~~~~~~~~~~~~~~
-
-Environment verification
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-To verify that the Intel Gaudi software was correctly installed, run:
-
-.. code:: console
-
- $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
- $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
- $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
- $ pip list | grep neural # verify that neural_compressor is installed
-
-Refer to `Intel Gaudi Software Stack
-Verification `__
-for more details.
-
-Run Docker Image
-^^^^^^^^^^^^^^^^
-
-It is highly recommended to use the latest Docker image from Intel Gaudi
-vault. Refer to the `Intel Gaudi
-documentation `__
-for more details.
-
-Use the following commands to run a Docker image:
-
-.. code:: console
-
- $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
- $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-
-Build and Install vLLM
-^^^^^^^^^^^^^^^^^^^^^^
-
-To build and install vLLM from source, run:
-
-.. code:: console
-
- $ git clone https://github.com/vllm-project/vllm.git
- $ cd vllm
- $ python setup.py develop
-
-
-Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following:
-
-.. code:: console
-
- $ git clone https://github.com/HabanaAI/vllm-fork.git
- $ cd vllm-fork
- $ git checkout habana_main
- $ python setup.py develop
-
-
-Supported Features
-------------------
-
-- `Offline batched
- inference `__
-- Online inference via `OpenAI-Compatible
- Server `__
-- HPU autodetection - no need to manually select device within vLLM
-- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
-- Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
- prefill attention, Root Mean Square Layer Normalization, Rotary
- Positional Encoding
-- Tensor parallelism support for multi-card inference
-- Inference with `HPU Graphs `__
- for accelerating low-batch latency and throughput
-- Attention with Linear Biases (ALiBi)
-
-Unsupported Features
---------------------
-
-- Beam search
-- LoRA adapters
-- Quantization
-- Prefill chunking (mixed-batch inferencing)
-
-Supported Configurations
-------------------------
-
-The following configurations have been validated to be function with
-Gaudi2 devices. Configurations that are not listed may or may not work.
-
-- `meta-llama/Llama-2-7b `__
- on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
- datatype with random or greedy sampling
-- `meta-llama/Llama-2-7b-chat-hf `__
- on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
- datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3-8B `__
- on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
- datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3-8B-Instruct `__
- on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
- datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3.1-8B `__
- on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
- datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3.1-8B-Instruct `__
- on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
- datatype with random or greedy sampling
-- `meta-llama/Llama-2-70b `__
- with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
-- `meta-llama/Llama-2-70b-chat-hf `__
- with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3-70B `__
- with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3-70B-Instruct `__
- with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3.1-70B `__
- with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
-- `meta-llama/Meta-Llama-3.1-70B-Instruct `__
- with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
-
-Performance Tuning
-------------------
-
-Execution modes
-~~~~~~~~~~~~~~~
-
-Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag.
-
-.. list-table:: vLLM execution modes
- :widths: 25 25 50
- :header-rows: 1
-
- * - ``PT_HPU_LAZY_MODE``
- - ``enforce_eager``
- - execution mode
- * - 0
- - 0
- - torch.compile
- * - 0
- - 1
- - PyTorch eager mode
- * - 1
- - 0
- - HPU Graphs
- * - 1
- - 1
- - PyTorch lazy mode
-
-.. warning::
- In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
-
-
-Bucketing mechanism
-~~~~~~~~~~~~~~~~~~~
-
-Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
-In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``.
-
-.. note::
- Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
-
-Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
-
-.. code-block::
-
- INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
- INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
- INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
- INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-
-``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
-
-Example (with ramp-up)
-
-.. code-block::
-
- min = 2, step = 32, max = 64
- => ramp_up = (2, 4, 8, 16)
- => stable = (32, 64)
- => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
-
-Example (without ramp-up)
-
-.. code-block::
-
- min = 128, step = 128, max = 512
- => ramp_up = ()
- => stable = (128, 256, 384, 512)
- => buckets = ramp_up + stable => (128, 256, 384, 512)
-
-
-In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
-
-.. warning::
- If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
-
-As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket.
-
-.. note::
- Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
-
-Warmup
-~~~~~~
-
-Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
-
-.. code-block::
-
- INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
- INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
- INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
- ...
- INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
- INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
- INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
- INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
- ...
- INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
- INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
-
-This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
-
-.. tip::
- Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
-
-HPU Graph capture
-~~~~~~~~~~~~~~~~~
-
-`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
-
-
-When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default).
-Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage.
-Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable.
-Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
-Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture.
-With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
-Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints.
-Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
-
-.. note::
- ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
-
-User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
-- ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode
-- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt
-
-When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy.
-
-
-.. note::
- ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
-
-
-Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
-
-.. code-block::
-
- INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
- INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
- INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
- INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
- INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
- INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
- INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
- INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
- INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
- INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
- INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
- INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
- ...
- INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
- INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
- INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
- ...
- INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
- INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
- ...
- INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
- INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
- INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
- INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
- INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
- INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
- INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
- INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
- INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
-
-
-Recommended vLLM Parameters
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- We recommend running inference on Gaudi 2 with ``block_size`` of 128
- for BF16 data type. Using default values (16, 32) might lead to
- sub-optimal performance due to Matrix Multiplication Engine
- under-utilization (see `Gaudi
- Architecture `__).
-- For max throughput on Llama 7B, we recommend running with batch size
- of 128 or 256 and max context length of 2048 with HPU Graphs enabled.
- If you encounter out-of-memory issues, see troubleshooting section.
-
-Environment variables
-~~~~~~~~~~~~~~~~~~~~~
-
-**Diagnostic and profiling knobs:**
-
-- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default.
-- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default.
-- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default.
-- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default.
-- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default.
-
-**Performance tuning knobs:**
-
-- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default
-- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default
-- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default
-- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default
-- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default
-- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism
-
- - ``{phase}`` is either ``PROMPT`` or ``DECODE``
- - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK``
- - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX``
- - Default values:
-
- - Prompt:
- - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1``
- - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
- - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)``
- - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size``
- - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size``
- - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len``
-
- - Decode:
- - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1``
- - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
- - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs``
- - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size``
- - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size``
- - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
-
-
-Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
-
-- ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default
-- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs
-
-Troubleshooting: Tweaking HPU Graphs
-------------------------------------
-
-If you experience device out-of-memory issues or want to attempt
-inference at higher batch sizes, try tweaking HPU Graphs by following
-the below:
-
-- Tweak ``gpu_memory_utilization`` knob. It will decrease the
- allocation of KV cache, leaving some headroom for capturing graphs
- with larger batch size. By default ``gpu_memory_utilization`` is set
- to 0.9. It attempts to allocate ~90% of HBM left for KV cache after
- short profiling run. Note that decreasing reduces the number of KV
- cache blocks you have available, and therefore reduces the effective
- maximum number of tokens you can handle at a given time.
-
-- If this method is not efficient, you can disable ``HPUGraph``
- completely. With HPU Graphs disabled, you are trading latency and
- throughput at lower batches for potentially higher throughput on
- higher batches. You can do that by adding ``--enforce-eager`` flag to
- server (for online inference), or by passing ``enforce_eager=True``
- argument to LLM constructor (for offline inference).
diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md
new file mode 100644
index 0000000000000..996fb346f43d4
--- /dev/null
+++ b/docs/source/getting_started/installation.md
@@ -0,0 +1,199 @@
+(installation)=
+
+# Installation
+
+vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
+
+## Install released versions
+
+You can install vLLM using pip:
+
+```console
+$ # (Recommended) Create a new conda environment.
+$ conda create -n myenv python=3.12 -y
+$ conda activate myenv
+
+$ # Install vLLM with CUDA 12.1.
+$ pip install vllm
+```
+
+```{note}
+Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See [this issue](https://github.com/vllm-project/vllm/issues/8420) for more details.
+```
+
+````{note}
+As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
+We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
+
+```console
+$ # Install vLLM with CUDA 11.8.
+$ export VLLM_VERSION=0.6.1.post1
+$ export PYTHON_VERSION=310
+$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
+```
+
+In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
+
+Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
+````
+
+(install-the-latest-code)=
+
+## Install the latest code
+
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command:
+
+```console
+$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+```
+
+If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
+
+```console
+$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+```
+
+Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
+
+Another way to access the latest code is to use the docker images:
+
+```console
+$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
+```
+
+These docker images are used for CI and testing only, and they are not intended for production use. They will expire after several days.
+
+The latest code can contain bugs and may not be stable. Please use it with caution.
+
+(build-from-source)=
+
+## Build from source
+
+(python-only-build)=
+
+### Python-only build (without compilation)
+
+If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ VLLM_USE_PRECOMPILED=1 pip install --editable .
+```
+
+This will download the latest nightly wheel and use the compiled libraries from there in the install.
+
+The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.3.post1 PyPI wheel](https://pypi.org/project/vllm/#files):
+
+```console
+$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
+$ pip install --editable .
+```
+
+You can find more information about vLLM's wheels [above](#install-the-latest-code).
+
+```{note}
+There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
+It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel.
+```
+
+### Full build (with compilation)
+
+If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -e .
+```
+
+```{tip}
+Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
+
+For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
+As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
+
+[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
+The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
+```
+
+#### Use an existing PyTorch installation
+
+There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
+
+- Building vLLM with PyTorch nightly or a custom PyTorch build.
+- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124` to [install PyTorch nightly](https://pytorch.org/get-started/locally/), and then build vLLM on top of it.
+
+To build vLLM using an existing PyTorch installation:
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ python use_existing_torch.py
+$ pip install -r requirements-build.txt
+$ pip install -e . --no-build-isolation
+```
+
+#### Use the local cutlass for compilation
+
+Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
+To achieve this, you can set the environment variable `VLLM_CUTLASS_SRC_DIR` to point to your local cutlass directory.
+
+```console
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
+```
+
+#### Troubleshooting
+
+To avoid your system being overloaded, you can limit the number of compilation jobs
+to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
+
+```console
+$ export MAX_JOBS=6
+$ pip install -e .
+```
+
+This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory.
+A side effect is a much slower build process.
+
+Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
+
+```console
+$ # Use `--ipc=host` to make sure the shared memory is large enough.
+$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
+```
+
+If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
+
+```console
+$ export CUDA_HOME=/usr/local/cuda
+$ export PATH="${CUDA_HOME}/bin:$PATH"
+```
+
+Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
+
+```console
+$ nvcc --version # verify that nvcc is in your PATH
+$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
+```
+
+### Unsupported OS build
+
+vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
+
+Simply set the `VLLM_TARGET_DEVICE` environment variable to `empty` before installing:
+
+```console
+$ export VLLM_TARGET_DEVICE=empty
+$ pip install -e .
+```
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
deleted file mode 100644
index 9b6cb0e80d60e..0000000000000
--- a/docs/source/getting_started/installation.rst
+++ /dev/null
@@ -1,214 +0,0 @@
-.. _installation:
-
-============
-Installation
-============
-
-vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
-
-Requirements
-============
-
-* OS: Linux
-* Python: 3.9 -- 3.12
-* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
-
-Install released versions
-=========================
-
-You can install vLLM using pip:
-
-.. code-block:: console
-
- $ # (Recommended) Create a new conda environment.
- $ conda create -n myenv python=3.12 -y
- $ conda activate myenv
-
- $ # Install vLLM with CUDA 12.1.
- $ pip install vllm
-
-.. note::
-
- Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue `_ for more details.
-
-.. note::
-
- As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
- We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
-
- .. code-block:: console
-
- $ # Install vLLM with CUDA 11.8.
- $ export VLLM_VERSION=0.6.1.post1
- $ export PYTHON_VERSION=310
- $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
-
- In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
-
- Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
-
-
-.. _install-the-latest-code:
-
-Install the latest code
-=======================
-
-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command:
-
-.. code-block:: console
-
- $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
-
-.. code-block:: console
-
- $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
- $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
-
-Another way to access the latest code is to use the docker images:
-
-.. code-block:: console
-
- $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
- $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
-
-These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
-
-The latest code can contain bugs and may not be stable. Please use it with caution.
-
-.. _build_from_source:
-
-Build from source
-=================
-
-.. _python-only-build:
-
-Python-only build (without compilation)
----------------------------------------
-
-If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag `_, changes you make to the code will be reflected when you run vLLM:
-
-.. code-block:: console
-
- $ git clone https://github.com/vllm-project/vllm.git
- $ cd vllm
- $ VLLM_USE_PRECOMPILED=1 pip install --editable .
-
-This will download the latest nightly wheel and use the compiled libraries from there in the install.
-
-The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel `_:
-
-.. code-block:: console
-
- $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
- $ pip install --editable .
-
-You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
-
-.. note::
-
- There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
- It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel.
-
-Full build (with compilation)
------------------------------
-
-If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
-
-.. code-block:: console
-
- $ git clone https://github.com/vllm-project/vllm.git
- $ cd vllm
- $ pip install -e .
-
-.. tip::
-
- Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
-
- For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` .
- As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
-
- `sccache `_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments.
- The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``.
-
-
-Use an existing PyTorch installation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
-
-* Building vLLM with PyTorch nightly or a custom PyTorch build.
-* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly `_, and then build vLLM on top of it.
-
-To build vLLM using an existing PyTorch installation:
-
-.. code-block:: console
-
- $ git clone https://github.com/vllm-project/vllm.git
- $ cd vllm
- $ python use_existing_torch.py
- $ pip install -r requirements-build.txt
- $ pip install -e . --no-build-isolation
-
-
-Use the local cutlass for compilation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
-To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory.
-
-.. code-block:: console
-
- $ git clone https://github.com/vllm-project/vllm.git
- $ cd vllm
- $ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
-
-
-Troubleshooting
-~~~~~~~~~~~~~~~
-
-To avoid your system being overloaded, you can limit the number of compilation jobs
-to be run simultaneously, via the environment variable ``MAX_JOBS``. For example:
-
-.. code-block:: console
-
- $ export MAX_JOBS=6
- $ pip install -e .
-
-This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
-A side effect is a much slower build process.
-
-Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
-
-.. code-block:: console
-
- $ # Use `--ipc=host` to make sure the shared memory is large enough.
- $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
-
-If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website `_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:
-
-.. code-block:: console
-
- $ export CUDA_HOME=/usr/local/cuda
- $ export PATH="${CUDA_HOME}/bin:$PATH"
-
-Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
-
-.. code-block:: console
-
- $ nvcc --version # verify that nvcc is in your PATH
- $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
-
-
-Unsupported OS build
---------------------
-
-vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
-
-Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:
-
-.. code-block:: console
-
- $ export VLLM_TARGET_DEVICE=empty
- $ pip install -e .
diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md
new file mode 100644
index 0000000000000..d6de5760cc82c
--- /dev/null
+++ b/docs/source/getting_started/neuron-installation.md
@@ -0,0 +1,132 @@
+(installation-neuron)=
+
+# Installation with Neuron
+
+vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
+Paged Attention and Chunked Prefill are currently in development and will be available soon.
+Data types currently supported in Neuron SDK are FP16 and BF16.
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.11
+- Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
+- PyTorch 2.0.1/2.1.1
+- AWS Neuron SDK 2.16/2.17 (Verified on Python 3.8)
+
+Installation steps:
+
+- [Build from source](#build-from-source-neuron)
+
+ - [Step 0. Launch Trn1/Inf2 instances](#launch-instances)
+ - [Step 1. Install drivers and tools](#install-drivers)
+ - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx)
+ - [Step 3. Install vLLM from source](#install-vllm)
+
+(build-from-source-neuron)=
+
+```{note}
+The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
+```
+
+## Build from source
+
+The following instructions are applicable to Neuron SDK 2.16 and beyond.
+
+(launch-instances)=
+
+### Step 0. Launch Trn1/Inf2 instances
+
+Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
+
+- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
+- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/)
+- Select Ubuntu Server 22.04 LTS AMI
+- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
+- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
+
+(install-drivers)=
+
+### Step 1. Install drivers and tools
+
+Installing drivers and tools is not necessary if the [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is used. If the drivers and tools are not installed on the operating system, follow the steps below:
+
+```console
+# Configure Linux for Neuron repository updates
+. /etc/os-release
+sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`
-
- - :ref:`Step 0. Launch Trn1/Inf2 instances `
- - :ref:`Step 1. Install drivers and tools `
- - :ref:`Step 2. Install transformers-neuronx and its dependencies `
- - :ref:`Step 3. Install vLLM from source `
-
-.. _build_from_source_neuron:
-
-.. note::
-
- The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-
-Build from source
------------------
-
-Following instructions are applicable to Neuron SDK 2.16 and beyond.
-
-.. _launch_instances:
-
-Step 0. Launch Trn1/Inf2 instances
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS `_.
-
-- Please follow the instructions at `launch an Amazon EC2 Instance `_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
-- To get more information about instances sizes and pricing see: `Trn1 web page `_, `Inf2 web page `_
-- Select Ubuntu Server 22.04 TLS AMI
-- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
-- After launching the instance, follow the instructions in `Connect to your instance `_ to connect to the instance
-
-.. _install_drivers:
-
-Step 1. Install drivers and tools
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron `_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
-
-.. code-block:: console
-
- # Configure Linux for Neuron repository updates
- . /etc/os-release
- sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`_ will be the backend to support inference on trn1/inf2 instances.
-Follow the steps below to install transformer-neuronx package and its dependencies.
-
-.. code-block:: console
-
- # Install Python venv
- sudo apt-get install -y python3.10-venv g++
-
- # Create Python venv
- python3.10 -m venv aws_neuron_venv_pytorch
-
- # Activate Python venv
- source aws_neuron_venv_pytorch/bin/activate
-
- # Install Jupyter notebook kernel
- pip install ipykernel
- python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
- pip install jupyter notebook
- pip install environment_kernels
-
- # Set pip repository pointing to the Neuron repository
- python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
-
- # Install wget, awscli
- python -m pip install wget
- python -m pip install awscli
-
- # Update Neuron Compiler and Framework
- python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
-
-.. _install_vllm:
-
-Step 3. Install vLLM from source
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
-
-.. code-block:: console
-
- $ git clone https://github.com/vllm-project/vllm.git
- $ cd vllm
- $ pip install -U -r requirements-neuron.txt
- $ VLLM_TARGET_DEVICE="neuron" pip install .
-
-If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed.
diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/openvino-installation.md
new file mode 100644
index 0000000000000..8b43c0a90447f
--- /dev/null
+++ b/docs/source/getting_started/openvino-installation.md
@@ -0,0 +1,104 @@
+(installation-openvino)=
+
+# Installation with OpenVINO
+
+vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
+
+- Prefix caching (`--enable-prefix-caching`)
+- Chunked prefill (`--enable-chunked-prefill`)
+
+**Table of contents**:
+
+- [Requirements](#openvino-backend-requirements)
+- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile)
+- [Build from source](#install-openvino-backend-from-source)
+- [Performance tips](#openvino-backend-performance-tips)
+- [Limitations](#openvino-backend-limitations)
+
+(openvino-backend-requirements)=
+
+## Requirements
+
+- OS: Linux
+- Instruction set architecture (ISA) requirement: at least AVX2.
+
+(openvino-backend-quick-start-dockerfile)=
+
+## Quick start using Dockerfile
+
+```console
+$ docker build -f Dockerfile.openvino -t vllm-openvino-env .
+$ docker run -it --rm vllm-openvino-env
+```
+
+(install-openvino-backend-from-source)=
+
+## Install from source
+
+- First, install Python. For example, on Ubuntu 22.04, you can run:
+
+ ```console
+ $ sudo apt-get update -y
+ $ sudo apt-get install python3
+ ```
+
+- Second, install the prerequisites for the vLLM OpenVINO backend installation:
+
+ ```console
+ $ pip install --upgrade pip
+ $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+ ```
+
+- Finally, install vLLM with OpenVINO backend:
+
+ ```console
+ $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+ ```
+
+- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+
+(openvino-backend-performance-tips)=
+
+## Performance tips
+
+### vLLM OpenVINO backend environment variables
+
+- `VLLM_OPENVINO_DEVICE` to specify which device to utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default.
+- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export the model with different compression techniques using `optimum-cli` and pass the exported folder as `<model_id>`
+
+### CPU performance tips
+
+CPU uses the following environment variables to control behavior:
+
+- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
+
+To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`)
+
+OpenVINO best known configuration for CPU is:
+
+```console
+$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
+ python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
+```
+
+### GPU performance tips
+
+GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache).
+
+Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
+
+OpenVINO best known configuration for GPU is:
+
+```console
+$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
+ python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+(openvino-backend-limitations)=
+
+## Limitations
+
+- LoRA serving is not supported.
+- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
+- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst
deleted file mode 100644
index 5eeb7c78f7e51..0000000000000
--- a/docs/source/getting_started/openvino-installation.rst
+++ /dev/null
@@ -1,116 +0,0 @@
-.. _installation_openvino:
-
-Installation with OpenVINO
-==========================
-
-vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs `_). OpenVINO vLLM backend supports the following advanced vLLM features:
-
-- Prefix caching (``--enable-prefix-caching``)
-- Chunked prefill (``--enable-chunked-prefill``)
-
-**Table of contents**:
-
-- :ref:`Requirements `
-- :ref:`Quick start using Dockerfile `
-- :ref:`Build from source `
-- :ref:`Performance tips `
-- :ref:`Limitations `
-
-.. _openvino_backend_requirements:
-
-Requirements
-------------
-
-* OS: Linux
-* Instruction set architecture (ISA) requirement: at least AVX2.
-
-.. _openvino_backend_quick_start_dockerfile:
-
-Quick start using Dockerfile
-----------------------------
-
-.. code-block:: console
-
- $ docker build -f Dockerfile.openvino -t vllm-openvino-env .
- $ docker run -it --rm vllm-openvino-env
-
-.. _install_openvino_backend_from_source:
-
-Install from source
--------------------
-
-- First, install Python. For example, on Ubuntu 22.04, you can run:
-
- .. code-block:: console
-
- $ sudo apt-get update -y
- $ sudo apt-get install python3
-
-- Second, install prerequisites vLLM OpenVINO backend installation:
-
- .. code-block:: console
-
- $ pip install --upgrade pip
- $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-
-- Finally, install vLLM with OpenVINO backend:
-
- .. code-block:: console
-
- $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
-
-- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html `_.
-
-.. _openvino_backend_performance_tips:
-
-Performance tips
-----------------
-
-vLLM OpenVINO backend environment variables
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- ``VLLM_OPENVINO_DEVICE`` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default.
-
-- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as ``
-
-CPU performance tips
-~~~~~~~~~~~~~~~~~~~~
-
-CPU uses the following environment variables to control behavior:
-
-- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-
-- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
-
-To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``)
-
-OpenVINO best known configuration for CPU is:
-
-.. code-block:: console
-
- $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
- python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
-
-GPU performance tips
-~~~~~~~~~~~~~~~~~~~~
-GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache).
-
-Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
-
-OpenVINO best known configuration for GPU is:
-
-.. code-block:: console
-
- $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
- python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
-
-.. _openvino_backend_limitations:
-
-Limitations
------------
-
-- LoRA serving is not supported.
-
-- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
-
-- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
new file mode 100644
index 0000000000000..165e5df146dcd
--- /dev/null
+++ b/docs/source/getting_started/quickstart.md
@@ -0,0 +1,175 @@
+(quickstart)=
+
+# Quickstart
+
+This guide will help you quickly get started with vLLM to:
+
+- [Run offline batched inference](#offline-batched-inference)
+- [Run OpenAI-compatible inference](#openai-compatible-server)
+
+## Prerequisites
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
+
+## Installation
+
+You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
+
+```console
+$ conda create -n myenv python=3.10 -y
+$ conda activate myenv
+$ pip install vllm
+```
+
+Please refer to the {ref}`installation documentation ` for more details on installing vLLM.
+
+(offline-batched-inference)=
+
+## Offline Batched Inference
+
+With vLLM installed, you can start generating texts for a list of input prompts (i.e. offline batch inferencing). See the example script:
+
+The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
+
+- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine.
+- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process.
+
+```python
+from vllm import LLM, SamplingParams
+```
+
+The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html).
+
+```python
+prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+```
+
+The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models).
+
+```python
+llm = LLM(model="facebook/opt-125m")
+```
+
+```{note}
+By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
+```
+
+Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
+
+```python
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+(openai-compatible-server)=
+
+## OpenAI-Compatible Server
+
+vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
+By default, it starts the server at `http://localhost:8000`. You can specify the address with `--host` and `--port` arguments. The server currently hosts one model at a time and implements endpoints such as [list models](https://platform.openai.com/docs/api-reference/models/list), [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create), and [create completion](https://platform.openai.com/docs/api-reference/completions/create) endpoints.
+
+Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model:
+
+```console
+$ vllm serve Qwen/Qwen2.5-1.5B-Instruct
+```
+
+```{note}
+By default, the server uses a predefined chat template stored in the tokenizer.
+You can learn about overriding it [here](#chat-template).
+```
+
+This server can be queried in the same format as OpenAI API. For example, to list the models:
+
+```console
+$ curl http://localhost:8000/v1/models
+```
+
+You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header.
+
+### OpenAI Completions API with vLLM
+
+Once your server is started, you can query the model with input prompts:
+
+```console
+$ curl http://localhost:8000/v1/completions \
+$ -H "Content-Type: application/json" \
+$ -d '{
+$ "model": "Qwen/Qwen2.5-1.5B-Instruct",
+$ "prompt": "San Francisco is a",
+$ "max_tokens": 7,
+$ "temperature": 0
+$ }'
+```
+
+Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package:
+
+```python
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+client = OpenAI(
+ api_key=openai_api_key,
+ base_url=openai_api_base,
+)
+completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
+ prompt="San Francisco is a")
+print("Completion result:", completion)
+```
+
+A more detailed client example can be found here:
+
+### OpenAI Chat Completions API with vLLM
+
+vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+
+You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model:
+
+```console
+$ curl http://localhost:8000/v1/chat/completions \
+$ -H "Content-Type: application/json" \
+$ -d '{
+$ "model": "Qwen/Qwen2.5-1.5B-Instruct",
+$ "messages": [
+$ {"role": "system", "content": "You are a helpful assistant."},
+$ {"role": "user", "content": "Who won the world series in 2020?"}
+$ ]
+$ }'
+```
+
+Alternatively, you can use the `openai` python package:
+
+```python
+from openai import OpenAI
+# Set OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+ api_key=openai_api_key,
+ base_url=openai_api_base,
+)
+
+chat_response = client.chat.completions.create(
+ model="Qwen/Qwen2.5-1.5B-Instruct",
+ messages=[
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "Tell me a joke."},
+ ]
+)
+print("Chat response:", chat_response)
+```
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
deleted file mode 100644
index 0c0491c860563..0000000000000
--- a/docs/source/getting_started/quickstart.rst
+++ /dev/null
@@ -1,181 +0,0 @@
-.. _quickstart:
-
-==========
-Quickstart
-==========
-
-This guide will help you quickly get started with vLLM to:
-
-* :ref:`Run offline batched inference `
-* :ref:`Run OpenAI-compatible inference `
-
-Prerequisites
---------------
-- OS: Linux
-- Python: 3.9 -- 3.12
-- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
-
-Installation
---------------
-
-You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments.
-
-.. code-block:: console
-
- $ conda create -n myenv python=3.10 -y
- $ conda activate myenv
- $ pip install vllm
-
-Please refer to the :ref:`installation documentation ` for more details on installing vLLM.
-
-.. _offline_batched_inference:
-
-Offline Batched Inference
--------------------------
-
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__.
-
-The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`:
-
-- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine.
-- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process.
-
-.. code-block:: python
-
- from vllm import LLM, SamplingParams
-
-The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__.
-
-.. code-block:: python
-
- prompts = [
- "Hello, my name is",
- "The president of the United States is",
- "The capital of France is",
- "The future of AI is",
- ]
- sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `.
-
-.. code-block:: python
-
- llm = LLM(model="facebook/opt-125m")
-
-.. note::
-
- By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine.
-
-Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens.
-
-.. code-block:: python
-
- outputs = llm.generate(prompts, sampling_params)
-
- for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs[0].text
- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-.. _openai_compatible_server:
-
-OpenAI-Compatible Server
-------------------------
-
-vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints.
-
-Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model:
-
-.. code-block:: console
-
- $ vllm serve Qwen/Qwen2.5-1.5B-Instruct
-
-.. note::
-
- By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__.
-
-This server can be queried in the same format as OpenAI API. For example, to list the models:
-
-.. code-block:: console
-
- $ curl http://localhost:8000/v1/models
-
-You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header.
-
-OpenAI Completions API with vLLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Once your server is started, you can query the model with input prompts:
-
-.. code-block:: console
-
- $ curl http://localhost:8000/v1/completions \
- $ -H "Content-Type: application/json" \
- $ -d '{
- $ "model": "Qwen/Qwen2.5-1.5B-Instruct",
- $ "prompt": "San Francisco is a",
- $ "max_tokens": 7,
- $ "temperature": 0
- $ }'
-
-Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package:
-
-.. code-block:: python
-
- from openai import OpenAI
-
- # Modify OpenAI's API key and API base to use vLLM's API server.
- openai_api_key = "EMPTY"
- openai_api_base = "http://localhost:8000/v1"
- client = OpenAI(
- api_key=openai_api_key,
- base_url=openai_api_base,
- )
- completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
- prompt="San Francisco is a")
- print("Completion result:", completion)
-
-A more detailed client example can be found `here `__.
-
-OpenAI Chat Completions API with vLLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
-
-You can use the `create chat completion `_ endpoint to interact with the model:
-
-.. code-block:: console
-
- $ curl http://localhost:8000/v1/chat/completions \
- $ -H "Content-Type: application/json" \
- $ -d '{
- $ "model": "Qwen/Qwen2.5-1.5B-Instruct",
- $ "messages": [
- $ {"role": "system", "content": "You are a helpful assistant."},
- $ {"role": "user", "content": "Who won the world series in 2020?"}
- $ ]
- $ }'
-
-Alternatively, you can use the ``openai`` python package:
-
-.. code-block:: python
-
- from openai import OpenAI
- # Set OpenAI's API key and API base to use vLLM's API server.
- openai_api_key = "EMPTY"
- openai_api_base = "http://localhost:8000/v1"
-
- client = OpenAI(
- api_key=openai_api_key,
- base_url=openai_api_base,
- )
-
- chat_response = client.chat.completions.create(
- model="Qwen/Qwen2.5-1.5B-Instruct",
- messages=[
- {"role": "system", "content": "You are a helpful assistant."},
- {"role": "user", "content": "Tell me a joke."},
- ]
- )
- print("Chat response:", chat_response)
diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md
new file mode 100644
index 0000000000000..f2a949e7247d8
--- /dev/null
+++ b/docs/source/getting_started/tpu-installation.md
@@ -0,0 +1,192 @@
+(installation-tpu)=
+
+# Installation with TPU
+
+Tensor Processing Units (TPUs) are Google's custom-developed application-specific
+integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
+are available in different versions each with different hardware specifications.
+For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm).
+For more information on the TPU versions supported with vLLM, see:
+
+- [TPU v6e](https://cloud.google.com/tpu/docs/v6e)
+- [TPU v5e](https://cloud.google.com/tpu/docs/v5e)
+- [TPU v5p](https://cloud.google.com/tpu/docs/v5p)
+- [TPU v4](https://cloud.google.com/tpu/docs/v4)
+
+These TPU versions allow you to configure the physical arrangements of the TPU
+chips. This can improve throughput and networking performance. For more
+information see:
+
+- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations)
+- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config)
+- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config)
+- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config)
+
+In order for you to use Cloud TPUs you need to have TPU quota granted to your
+Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
+GCP project and are specified in terms of TPU version, the number of TPUs you
+want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota).
+
+For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing).
+
+You may need additional persistent storage for your TPU VMs. For more
+information, see [Storage options for Cloud TPU data](https://cloud.google.com/tpu/docs/storage-options).
+
+## Requirements
+
+- Google Cloud TPU VM
+- TPU versions: v6e, v5e, v5p, v4
+- Python: 3.10 or newer
+
+### Provision Cloud TPUs
+
+You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
+or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
+API. This section shows how to create TPUs using the queued resource API. For
+more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api).
+Queued resources enable you to request Cloud TPU resources in a queued manner.
+When you request queued resources, the request is added to a queue maintained by
+the Cloud TPU service. When the requested resource becomes available, it's
+assigned to your Google Cloud project for your immediate exclusive use.
+
+```{note}
+In all of the following commands, replace the ALL CAPS parameter names with
+appropriate values. See the parameter descriptions table for more information.
+```
+
+## Provision a Cloud TPU with the queued resource API
+
+Create a TPU v5e with 4 TPU chips:
+
+```console
+gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
+--node-id TPU_NAME \
+--project PROJECT_ID \
+--zone ZONE \
+--accelerator-type ACCELERATOR_TYPE \
+--runtime-version RUNTIME_VERSION \
+--service-account SERVICE_ACCOUNT
+```
+
+```{eval-rst}
+.. list-table:: Parameter descriptions
+ :header-rows: 1
+
+ * - Parameter name
+ - Description
+ * - QUEUED_RESOURCE_ID
+ - The user-assigned ID of the queued resource request.
+ * - TPU_NAME
+ - The user-assigned name of the TPU which is created when the queued
+ resource request is allocated.
+ * - PROJECT_ID
+ - Your Google Cloud project
+ * - ZONE
+ - The GCP zone where you want to create your Cloud TPU. The value you use
+ depends on the version of TPUs you are using. For more information, see
+ `TPU regions and zones `_
+ * - ACCELERATOR_TYPE
+ - The TPU version you want to use. Specify the TPU version, for example
+ `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
+ see `TPU versions `_.
+ * - RUNTIME_VERSION
+ - The TPU VM runtime version to use. For more information see `TPU VM images `_.
+ * - SERVICE_ACCOUNT
+ - The email address for your service account. You can find it in the IAM
+ Cloud Console under *Service Accounts*. For example:
+      `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
+```
+
+Connect to your TPU using SSH:
+
+```bash
+gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
+```
+
+Install Miniconda
+
+```bash
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh
+source ~/.bashrc
+```
+
+Create and activate a Conda environment for vLLM:
+
+```bash
+conda create -n vllm python=3.10 -y
+conda activate vllm
+```
+
+Clone the vLLM repository and go to the vLLM directory:
+
+```bash
+git clone https://github.com/vllm-project/vllm.git && cd vllm
+```
+
+Uninstall the existing `torch` and `torch_xla` packages:
+
+```bash
+pip uninstall torch torch-xla -y
+```
+
+Install build dependencies:
+
+```bash
+pip install -r requirements-tpu.txt
+sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+```
+
+Run the setup script:
+
+```bash
+VLLM_TARGET_DEVICE="tpu" python setup.py develop
+```
+
+## Provision Cloud TPUs with GKE
+
+For more information about using TPUs with GKE, see
+
+
+
+
+(build-docker-tpu)=
+
+## Build a docker image with {code}`Dockerfile.tpu`
+
+You can use `Dockerfile.tpu` to build a Docker image with TPU support.
+
+```console
+$ docker build -f Dockerfile.tpu -t vllm-tpu .
+```
+
+Run the Docker image with the following command:
+
+```console
+$ # Make sure to add `--privileged --net host --shm-size=16G`.
+$ docker run --privileged --net host --shm-size=16G -it vllm-tpu
+```
+
+```{note}
+Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
+possible input shapes and compiles an XLA graph for each shape. The
+compilation time may take 20~30 minutes in the first run. However, the
+compilation time reduces to ~5 minutes afterwards because the XLA graphs are
+cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default).
+```
+
+````{tip}
+If you encounter the following error:
+
+```console
+from torch._C import * # noqa: F403
+ImportError: libopenblas.so.0: cannot open shared object file: No such
+file or directory
+```
+
+Install OpenBLAS with the following command:
+
+```console
+$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+```
+````
diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
deleted file mode 100644
index 22cc684a1c778..0000000000000
--- a/docs/source/getting_started/tpu-installation.rst
+++ /dev/null
@@ -1,200 +0,0 @@
-.. _installation_tpu:
-
-#####################
-Installation with TPU
-#####################
-
-Tensor Processing Units (TPUs) are Google's custom-developed application-specific
-integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
-are available in different versions each with different hardware specifications.
-For more information about TPUs, see `TPU System Architecture `_.
-For more information on the TPU versions supported with vLLM, see:
-
-* `TPU v6e `_
-* `TPU v5e `_
-* `TPU v5p `_
-* `TPU v4 `_
-
-These TPU versions allow you to configure the physical arrangements of the TPU
-chips. This can improve throughput and networking performance. For more
-information see:
-
-* `TPU v6e topologies `_
-* `TPU v5e topologies `_
-* `TPU v5p topologies `_
-* `TPU v4 topologies `_
-
-In order for you to use Cloud TPUs you need to have TPU quota granted to your
-Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
-GPC project and are specified in terms of TPU version, the number of TPU you
-want to use, and quota type. For more information, see `TPU quota `_.
-
-For TPU pricing information, see `Cloud TPU pricing `_.
-
-You may need additional persistent storage for your TPU VMs. For more
-information, see `Storage options for Cloud TPU data `_.
-
-Requirements
-------------
-
-* Google Cloud TPU VM
-* TPU versions: v6e, v5e, v5p, v4
-* Python: 3.10 or newer
-
-Provision Cloud TPUs
-====================
-
-You can provision Cloud TPUs using the `Cloud TPU API `_
-or the `queued resources `_
-API. This section shows how to create TPUs using the queued resource API. For
-more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_.
-Queued resources enable you to request Cloud TPU resources in a queued manner.
-When you request queued resources, the request is added to a queue maintained by
-the Cloud TPU service. When the requested resource becomes available, it's
-assigned to your Google Cloud project for your immediate exclusive use.
-
-.. note::
- In all of the following commands, replace the ALL CAPS parameter names with
- appropriate values. See the parameter descriptions table for more information.
-
-Provision a Cloud TPU with the queued resource API
---------------------------------------------------
-Create a TPU v5e with 4 TPU chips:
-
-.. code-block:: console
-
- gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
- --node-id TPU_NAME \
- --project PROJECT_ID \
- --zone ZONE \
- --accelerator-type ACCELERATOR_TYPE \
- --runtime-version RUNTIME_VERSION \
- --service-account SERVICE_ACCOUNT
-
-
-.. list-table:: Parameter descriptions
- :header-rows: 1
-
- * - Parameter name
- - Description
- * - QUEUED_RESOURCE_ID
- - The user-assigned ID of the queued resource request.
- * - TPU_NAME
- - The user-assigned name of the TPU which is created when the queued
- resource request is allocated.
- * - PROJECT_ID
- - Your Google Cloud project
- * - ZONE
- - The GCP zone where you want to create your Cloud TPU. The value you use
- depends on the version of TPUs you are using. For more information, see
- `TPU regions and zones `_
- * - ACCELERATOR_TYPE
- - The TPU version you want to use. Specify the TPU version, for example
- `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
- see `TPU versions `_.
- * - RUNTIME_VERSION
- - The TPU VM runtime version to use. For more information see `TPU VM images `_.
- * - SERVICE_ACCOUNT
- - The email address for your service account. You can find it in the IAM
- Cloud Console under *Service Accounts*. For example:
- `tpu-service-account@