diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py new file mode 100644 index 0000000000000..8350e2705141e --- /dev/null +++ b/.buildkite/generate_index.py @@ -0,0 +1,24 @@ +import argparse +import os + +template = """<!DOCTYPE html> +<html> +  <body> +    <h1>Links for vLLM</h1> +    <a href="../{wheel_html_escaped}">
+ {wheel}
+ + +""" + +parser = argparse.ArgumentParser() +parser.add_argument("--wheel", help="The wheel path.", required=True) +args = parser.parse_args() + +filename = os.path.basename(args.wheel) + +with open("index.html", "w") as f: + print(f"Generated index.html for {args.wheel}") + # cloudfront requires escaping the '+' character + f.write( + template.format(wheel=filename, + wheel_html_escaped=filename.replace("+", "%2B"))) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 64ba1b32fb074..708e548727cf5 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -65,9 +65,9 @@ steps: - VLLM_USAGE_SOURCE - HF_TOKEN - - block: "Run H100 Benchmark" - key: block-h100 - depends_on: ~ + #- block: "Run H100 Benchmark" + #key: block-h100 + #depends_on: ~ - label: "H100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh index 7345dd4e66b29..3c756659a715a 100644 --- a/.buildkite/upload-wheels.sh +++ b/.buildkite/upload-wheels.sh @@ -23,6 +23,8 @@ wheel="$new_wheel" version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) echo "Version: $version" +normal_wheel="$wheel" # Save the original wheel filename + # If the version contains "dev", rename it to v1.0.0.dev for consistency if [[ $version == *dev* ]]; then suffix="${version##*.}" @@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then new_version="1.0.0.dev" fi new_wheel="${wheel/$version/$new_version}" - mv -- "$wheel" "$new_wheel" + # use cp to keep both files in the artifacts directory + cp -- "$wheel" "$new_wheel" wheel="$new_wheel" version="$new_version" fi # Upload the wheel to S3 +python3 .buildkite/generate_index.py --wheel "$normal_wheel" + +# generate index for this commit aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" + +if [[ $normal_wheel == *"cu118"* ]]; then + # if $normal_wheel matches cu118, do not upload the index.html + echo "Skipping index files for cu118 wheels" +else + # only upload index.html for cu12 wheels (default wheels) + aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" + aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" +fi + +# generate index for nightly aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" + +if [[ $normal_wheel == *"cu118"* ]]; then + # if $normal_wheel matches cu118, do not upload the index.html + echo "Skipping index files for cu118 wheels" +else + # only upload index.html for cu12 wheels (default wheels) + aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" +fi + aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file diff --git a/.gitignore b/.gitignore index ceef6a5fba456..bb7e4d5b244a8 100644 --- a/.gitignore +++ b/.gitignore @@ -81,6 +81,8 @@ instance/ docs/_build/ docs/source/getting_started/examples/*.rst !**/*.template.rst +docs/source/getting_started/examples/*.md +!**/*.template.md # PyBuilder .pybuilder/ diff --git a/Dockerfile b/Dockerfile index 0944050f7dfca..153bff9cf565f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # to run the OpenAI compatible server. 
# Please update any changes made here to -# docs/source/dev/dockerfile/dockerfile.rst and +# docs/source/dev/dockerfile/dockerfile.md and # docs/source/assets/dev/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.4.1 @@ -163,7 +163,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ @@ -240,9 +240,9 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.cpu b/Dockerfile.cpu index ebe226cf6d148..f163edc27cba8 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace +COPY requirements-build.txt requirements-build.txt ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ pip install -r requirements-build.txt @@ -37,9 +37,9 @@ FROM cpu-test-1 AS build WORKDIR /workspace/vllm +COPY requirements-common.txt requirements-common.txt +COPY requirements-cpu.txt requirements-cpu.txt RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ pip install -v -r requirements-cpu.txt COPY . . diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 2924ea4a49f54..94999630bae12 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -10,7 +10,8 @@ set -ex kill_gpu_processes() { # kill all processes on GPU. 
- pkill -f pt_main_thread + pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 sleep 10 # remove vllm config file @@ -54,7 +55,7 @@ benchmark() { CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ - --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --model $model \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ @@ -64,7 +65,7 @@ benchmark() { CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ - --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --model $model \ --port 8200 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ @@ -87,7 +88,7 @@ benchmark() { --port 8100 \ --save-result \ --result-dir $results_folder \ - --result-filename disagg_prefill_2xtp4.json \ + --result-filename disagg_prefill_tp1.json \ --request-rate "inf" @@ -105,7 +106,7 @@ benchmark() { --port 8200 \ --save-result \ --result-dir $results_folder \ - --result-filename disagg_prefill_2xtp4.json \ + --result-filename disagg_prefill_tp1_overhead.json \ --request-rate "$qps" kill_gpu_processes @@ -118,7 +119,7 @@ main() { (which jq) || (apt-get -y install jq) (which socat) || (apt-get -y install socat) - pip install quart httpx + pip install quart httpx datasets cd "$(dirname "$0")" diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index d8d9e976dce76..eb5d891d0d4a5 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -1,13 +1,12 @@ #!/bin/bash -# Requirement: 8x H100 GPUs. +# Requirement: 2x GPUs. -# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV -# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests -# Resource: 8x H100 +# Model: meta-llama/Meta-Llama-3.1-8B-Instruct +# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests +# Resource: 2x GPU # Approaches: -# 1. Chunked prefill: 1 vllm instance with tp=8 # 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 # 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance # Prefilling instance: max_output_token=1 @@ -114,7 +113,6 @@ benchmark() { --request-rate "$qps" sleep 2 - } @@ -123,8 +121,9 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get -y install jq) (which socat) || (apt-get -y install socat) + (which lsof) || (apt-get -y install lsof) - pip install quart httpx matplotlib aiohttp + pip install quart httpx matplotlib aiohttp datasets cd "$(dirname "$0")" diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index ca2da4cd66d2d..4859c8ac08bea 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==6.2.1 sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 -myst-parser==2.0.0 +myst-parser==3.0.1 sphinx-argparse==0.4.0 msgspec cloudpickle diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/automatic_prefix_caching/apc.md new file mode 100644 index 0000000000000..c0c141c5fb7ef --- /dev/null +++ b/docs/source/automatic_prefix_caching/apc.md @@ -0,0 +1,102 @@ +(apc)= + +# Introduction + +## What is Automatic Prefix Caching + +Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. 
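As a quick illustration (using the small `facebook/opt-125m` model as a stand-in), the sketch below enables APC and issues two queries that share a long prefix, so the second query can reuse the cached prefix. A fuller, timed example follows below.

```python
from vllm import LLM, SamplingParams

# Minimal sketch: enable APC and send two queries that share a long prefix.
# "facebook/opt-125m" is only a small stand-in model; any supported model works.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
sampling_params = SamplingParams(temperature=0, max_tokens=32)

shared_prefix = "Here is a long document that both questions refer to. ..."

# The first query computes and caches the KV cache of the shared prefix.
llm.generate(shared_prefix + "Question 1: ...", sampling_params=sampling_params)

# The second query can reuse the cached prefix and only compute its own suffix.
llm.generate(shared_prefix + "Question 2: ...", sampling_params=sampling_params)
```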
+ +```{note} +Technical details on how vLLM implements APC are in the next page. +``` + +## Enabling APC in vLLM + +Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: + +```python +import time +from vllm import LLM, SamplingParams + + +# A prompt containing a large markdown table. The table is randomly generated by GPT-4. +LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ +| ID | Name | Age | Occupation | Country | Email | Phone Number | Address | +|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| +| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | +| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | +| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | +| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | +| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | +| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | +| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | +| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | +| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | +| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| +| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | +| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | +| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | +| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | +| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | +| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | +| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | +| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | +| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | +| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | +| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | +| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | +| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| +| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | +| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | +| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | +| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 
Pine St, Leeds, UK | +| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | +| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | +| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | +""" + + +def get_generation_time(llm, sampling_params, prompts): + # time the generation + start_time = time.time() + output = llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + # print the output and generation time + print(f"Output: {output[0].outputs[0].text}") + print(f"Generation time: {end_time - start_time} seconds.") + + +# set enable_prefix_caching=True to enable APC +llm = LLM( + model='lmsys/longchat-13b-16k', + enable_prefix_caching=True +) + +sampling_params = SamplingParams(temperature=0, max_tokens=100) + +# Querying the age of John Doe +get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", +) + +# Querying the age of Zack Blue +# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. +get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", +) +``` + +## Example workloads + +We describe two example workloads, where APC can provide huge performance benefit: + +- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. +- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. + +## Limits + +APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). diff --git a/docs/source/automatic_prefix_caching/apc.rst b/docs/source/automatic_prefix_caching/apc.rst deleted file mode 100644 index 0d70c74689bf9..0000000000000 --- a/docs/source/automatic_prefix_caching/apc.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. _apc: - -Introduction -============ - -What is Automatic Prefix Caching --------------------------------- - -Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. - - -.. note:: - - Technical details on how vLLM implements APC are in the next page. 
- - - -Enabling APC in vLLM --------------------- - -Set ``enable_prefix_caching=True`` in vLLM engine to enable APC. Here is an example: - -.. code-block:: python - - import time - from vllm import LLM, SamplingParams - - - # A prompt containing a large markdown table. The table is randomly generated by GPT-4. - LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ - | ID | Name | Age | Occupation | Country | Email | Phone Number | Address | - |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| - | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | - | 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | - | 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | - | 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | - | 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | - | 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | - | 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | - | 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | - | 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | - | 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| - | 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | - | 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | - | 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | - | 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | - | 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | - | 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | - | 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | - | 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | - | 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | - | 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | - | 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | - | 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | - | 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| - | 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | - | 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | - | 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | - | 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, 
Leeds, UK | - | 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | - | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | - | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | - """ - - - def get_generation_time(llm, sampling_params, prompts): - # time the generation - start_time = time.time() - output = llm.generate(prompts, sampling_params=sampling_params) - end_time = time.time() - # print the output and generation time - print(f"Output: {output[0].outputs[0].text}") - print(f"Generation time: {end_time - start_time} seconds.") - - - # set enable_prefix_caching=True to enable APC - llm = LLM( - model='lmsys/longchat-13b-16k', - enable_prefix_caching=True - ) - - sampling_params = SamplingParams(temperature=0, max_tokens=100) - - # Querying the age of John Doe - get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", - ) - - # Querying the age of Zack Blue - # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. - get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", - ) - -Example workloads ------------------ - -We describe two example workloads, where APC can provide huge performance benefit: - -- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. -- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. - - -Limits ------- -APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md new file mode 100644 index 0000000000000..43fa9ee616096 --- /dev/null +++ b/docs/source/community/meetups.md @@ -0,0 +1,15 @@ +(meetups)= + +# vLLM Meetups + +We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: + +- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. 
[[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) +- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing) +- [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing) +- [The fourth vLLM meetup](https://lu.ma/agivllm), with Cloudflare and BentoML, June 11th 2024. [[Slides]](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing) +- [The third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/), with Roblox, April 2nd 2024. [[Slides]](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing) +- [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg) +- [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing) + +We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst deleted file mode 100644 index c87f01aa263b3..0000000000000 --- a/docs/source/community/meetups.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. _meetups: - -vLLM Meetups -============ - -We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: - -- `The seventh vLLM meetup `__, with Snowflake, November 14th 2024. `[Slides] `__ -- `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__ -- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ -- `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ -- `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ -- `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ -- `The first vLLM meetup `__, with a16z, October 5th 2023. `[Slides] `__ - -We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu `__. diff --git a/docs/source/conf.py b/docs/source/conf.py index e9d9ac68c9560..1fe0474631140 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,7 +51,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns: List[str] = ["**/*.template.rst"] +exclude_patterns: List[str] = ["**/*.template.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " @@ -74,6 +74,35 @@ html_static_path = ["_static"] html_js_files = ["custom.js"] +myst_url_schemes = { + 'http': None, + 'https': None, + 'mailto': None, + 'ftp': None, + "gh-issue": { + "url": + "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}", + "title": "Issue #{{path}}", + "classes": ["github"], + }, + "gh-pr": { + "url": + "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}", + "title": "Pull Request #{{path}}", + "classes": ["github"], + }, + "gh-dir": { + "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}", + "title": "{{path}}", + "classes": ["github"], + }, + "gh-file": { + "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}", + "title": "{{path}}", + "classes": ["github"], + }, +} + # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') if READTHEDOCS_VERSION_TYPE == "tag": diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md new file mode 100644 index 0000000000000..6535414a7dca4 --- /dev/null +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -0,0 +1,50 @@ +# Dockerfile + +We provide a to construct the image for running an OpenAI compatible server with vLLM. +More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md). + +Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: + +- All build stages +- The default build target (highlighted in grey) +- External images (with dashed borders) + +The edges of the build graph represent: + +- FROM ... dependencies (with a solid line and a full arrow head) + +- COPY --from=... dependencies (with a dashed line and an empty arrow head) + +- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head) + + > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png + > :align: center + > :alt: query + > :width: 100% + > ``` + > + > Made using: + > + > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): + > + > ```bash + > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile + > ``` + > + > or in case you want to run it directly with the docker image: + > + > ```bash + > docker run \ + > --rm \ + > --user "$(id -u):$(id -g)" \ + > --workdir /workspace \ + > --volume "$(pwd)":/workspace \ + > ghcr.io/patrickhoefler/dockerfilegraph:alpine \ + > --output png \ + > --dpi 200 \ + > --max-label-length 50 \ + > --filename Dockerfile \ + > --legend + > ``` + > + > (To run it for a different file, you can pass in a different argument to the flag `--filename`.) diff --git a/docs/source/contributing/dockerfile/dockerfile.rst b/docs/source/contributing/dockerfile/dockerfile.rst deleted file mode 100644 index 9c17c27aa61bf..0000000000000 --- a/docs/source/contributing/dockerfile/dockerfile.rst +++ /dev/null @@ -1,50 +0,0 @@ -Dockerfile -==================== - -See `here `__ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here `__. 
- -Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: - -- All build stages -- The default build target (highlighted in grey) -- External images (with dashed borders) - -The edges of the build graph represent: - -- FROM ... dependencies (with a solid line and a full arrow head) -- COPY --from=... dependencies (with a dashed line and an empty arrow head) -- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) - - .. figure:: ../../assets/dev/dockerfile-stages-dependency.png - :alt: query - :width: 100% - :align: center - - Made using: https://github.com/patrickhoefler/dockerfilegraph - - Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present): - - .. code:: bash - - dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile - - or in case you want to run it directly with the docker image: - - .. code:: bash - - docker run \ - --rm \ - --user "$(id -u):$(id -g)" \ - --workdir /workspace \ - --volume "$(pwd)":/workspace \ - ghcr.io/patrickhoefler/dockerfilegraph:alpine \ - --output png \ - --dpi 200 \ - --max-label-length 50 \ - --filename Dockerfile \ - --legend - - (To run it for a different file, you can pass in a different argument to the flag `--filename`.) - - \ No newline at end of file diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.md similarity index 51% rename from docs/source/contributing/overview.rst rename to docs/source/contributing/overview.md index 4cea0afdaea74..9dac41cff0bcb 100644 --- a/docs/source/contributing/overview.rst +++ b/docs/source/contributing/overview.md @@ -1,5 +1,4 @@ -Contributing to vLLM -===================== +# Contributing to vLLM Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: @@ -12,132 +11,121 @@ We also believe in the power of community support; thus, answering queries, offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! -License -------- +## License -See `LICENSE `_. +See . -Developing ----------- +## Developing -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details. +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. +Check out the [building from source](#build-from-source) documentation for details. -Testing -------- +## Testing -.. code-block:: bash +```bash +pip install -r requirements-dev.txt - pip install -r requirements-dev.txt +# linting and formatting +bash format.sh +# Static type checking +mypy +# Unit tests +pytest tests/ +``` - # linting and formatting - bash format.sh - # Static type checking - mypy - # Unit tests - pytest tests/ +```{note} +Currently, the repository does not pass the `mypy` tests. +``` -.. note:: Currently, the repository does not pass the ``mypy`` tests. 
+# Contribution Guidelines -Contribution Guidelines -======================= +## Issues -Issues ------- +If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. +```{important} +If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). +``` -.. important:: - If you discover a security vulnerability, please follow the instructions `here `_. - -Pull Requests & Code Reviews ----------------------------- +## Pull Requests & Code Reviews Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process. -DCO and Signed-off-by -^^^^^^^^^^^^^^^^^^^^^ +### DCO and Signed-off-by -When contributing changes to this project, you must agree to the `DCO `_. -Commits must include a ``Signed-off-by:`` header which certifies agreement with -the terms of the `DCO `_. +When contributing changes to this project, you must agree to the . +Commits must include a `Signed-off-by:` header which certifies agreement with +the terms of the DCO. -Using ``-s`` with ``git commit`` will automatically add this header. +Using `-s` with `git commit` will automatically add this header. -PR Title and Classification -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### PR Title and Classification Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following: -- ``[Bugfix]`` for bug fixes. -- ``[CI/Build]`` for build or continuous integration improvements. -- ``[Doc]`` for documentation fixes and improvements. -- ``[Model]`` for adding a new model or improving an existing model. Model name +- `[Bugfix]` for bug fixes. +- `[CI/Build]` for build or continuous integration improvements. +- `[Doc]` for documentation fixes and improvements. +- `[Model]` for adding a new model or improving an existing model. Model name should appear in the title. -- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server, - ``LLM`` class, etc.) -- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels. -- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``, - ``AsyncLLMEngine``, ``Scheduler``, etc.) -- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should - appear in the prefix (e.g., ``[Hardware][AMD]``). -- ``[Misc]`` for PRs that do not fit the above categories. Please use this +- `[Frontend]` For changes on the vLLM frontend (e.g., OpenAI API server, + `LLM` class, etc.) +- `[Kernel]` for changes affecting CUDA kernels or other compute kernels. +- `[Core]` for changes in the core vLLM logic (e.g., `LLMEngine`, + `AsyncLLMEngine`, `Scheduler`, etc.) +- `[Hardware][Vendor]` for hardware-specific changes. Vendor name should + appear in the prefix (e.g., `[Hardware][AMD]`). +- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -.. 
note:: - If the PR spans more than one category, please include all relevant prefixes. +```{note} +If the PR spans more than one category, please include all relevant prefixes. +``` -Code Quality -^^^^^^^^^^^^ +### Code Quality The PR needs to meet the following code quality standards: -- We adhere to `Google Python style guide - `_ and `Google C++ style guide - `_. -- Pass all linter checks. Please use `format.sh - `_ to format your - code. +- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). +- Pass all linter checks. Please use to format your code. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests. -- Please add documentation to ``docs/source/`` if the PR modifies the +- Please add documentation to `docs/source/` if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes. -Adding or Changing Kernels -^^^^^^^^^^^^^^^^^^^^^^^^^^ +### Adding or Changing Kernels Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. - Make sure custom ops are registered following PyTorch guidelines: - `Custom C++ and CUDA Operators `_ - and `The Custom Operators Manual `_. -- Custom operations that return ``Tensors`` require meta-functions. + [Custom C++ and CUDA Operators](https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial) + and [The Custom Operators Manual](https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU). +- Custom operations that return `Tensors` require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions. -- Use `torch.library.opcheck() `_ +- Use [torch.library.opcheck()](https://pytorch.org/docs/stable/library.html#torch.library.opcheck) to test the function registration and meta-function for any registered ops. - See ``tests/kernels`` for examples. + See `tests/kernels` for examples. - When changing the C++ signature of an existing op, the schema must be updated to reflect the changes. - If a new custom type is needed, see the following document: - `Custom Class Support in PT2 `_. + [Custom Class Support in PT2](https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA). -Notes for Large Changes -^^^^^^^^^^^^^^^^^^^^^^^ +### Notes for Large Changes Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag -it with ``rfc-required`` and might not go through the PR. +it with `rfc-required` and might not go through the PR. -What to Expect for the Reviews -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### What to Expect for the Reviews The goal of the vLLM team is to be a *transparent reviewing machine*. We would like to make the review process transparent and efficient and make sure no @@ -150,15 +138,14 @@ review process: - After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team. 
-- After the review, the reviewer will put an ``action-required`` label on the PR +- After the review, the reviewer will put an `action-required` label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR. - Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. -Thank You ---------- +## Thank You Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. All of your contributions help make vLLM a great tool and community for everyone! diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md new file mode 100644 index 0000000000000..46210957c19ec --- /dev/null +++ b/docs/source/contributing/profiling/profiling_index.md @@ -0,0 +1,41 @@ +# Profiling vLLM + +We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/` + +The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. + +When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. + +```{warning} +Only enable profiling in a development environment. +``` + +Traces can be visualized using . + +```{tip} +Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. +``` + +```{tip} +To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. +Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. +`export VLLM_RPC_TIMEOUT=1800000` +``` + +## Example commands and usage + +### Offline Inference + +Refer to for an example. + +### OpenAI Server + +```bash +VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B +``` + +benchmark_serving.py: + +```bash +python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 +``` diff --git a/docs/source/contributing/profiling/profiling_index.rst b/docs/source/contributing/profiling/profiling_index.rst deleted file mode 100644 index a422b1fcda521..0000000000000 --- a/docs/source/contributing/profiling/profiling_index.rst +++ /dev/null @@ -1,48 +0,0 @@ -============== -Profiling vLLM -============== - -We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` - -The OpenAI server also needs to be started with the ``VLLM_TORCH_PROFILER_DIR`` environment variable set. - -When using ``benchmarks/benchmark_serving.py``, you can enable profiling by passing the ``--profile`` flag. - -.. warning:: - - Only enable profiling in a development environment. - - -Traces can be visualized using https://ui.perfetto.dev/. - -.. 
tip:: - - Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. - -.. tip:: - - To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. - Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. - ``export VLLM_RPC_TIMEOUT=1800000`` - -Example commands and usage: -=========================== - -Offline Inference: ------------------- - -Refer to `examples/offline_inference_with_profiler.py `_ for an example. - - -OpenAI Server: --------------- - -.. code-block:: bash - - VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B - -benchmark_serving.py: - -.. code-block:: bash - - python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 \ No newline at end of file diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.md similarity index 54% rename from docs/source/design/arch_overview.rst rename to docs/source/design/arch_overview.md index bc3f509f0a66e..475a3e5fa9ddc 100644 --- a/docs/source/design/arch_overview.rst +++ b/docs/source/design/arch_overview.md @@ -1,25 +1,24 @@ -.. _arch_overview: +(arch-overview)= -Architecture Overview -====================== +# Architecture Overview This document provides an overview of the vLLM architecture. -.. contents:: Table of Contents - :local: - :depth: 2 +```{contents} Table of Contents +:depth: 2 +:local: true +``` -Entrypoints ------------ +## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png - :alt: Entrypoints Diagram +```{image} /assets/design/arch_overview/entrypoints.excalidraw.png +:alt: Entrypoints Diagram +``` -LLM Class -^^^^^^^^^ +### LLM Class The LLM class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference @@ -27,75 +26,70 @@ server. Here is a sample of `LLM` class usage: -.. 
code-block:: python +```python +from vllm import LLM, SamplingParams - from vllm import LLM, SamplingParams +# Define a list of input prompts +prompts = [ + "Hello, my name is", + "The capital of France is", + "The largest ocean is", +] - # Define a list of input prompts - prompts = [ - "Hello, my name is", - "The capital of France is", - "The largest ocean is", - ] +# Define sampling parameters +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - # Define sampling parameters - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +# Initialize the LLM engine with the OPT-125M model +llm = LLM(model="facebook/opt-125m") - # Initialize the LLM engine with the OPT-125M model - llm = LLM(model="facebook/opt-125m") +# Generate outputs for the input prompts +outputs = llm.generate(prompts, sampling_params) - # Generate outputs for the input prompts - outputs = llm.generate(prompts, sampling_params) +# Print the generated outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` - # Print the generated outputs - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -More API details can be found in the :doc:`Offline Inference +More API details can be found in the {doc}`Offline Inference ` section of the API docs. -The code for the `LLM` class can be found in `vllm/entrypoints/llm.py -`_. +The code for the `LLM` class can be found in . -OpenAI-compatible API server -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### OpenAI-compatible API server The second primary interface to vLLM is via its OpenAI-compatible API server. This server can be started using the `vllm serve` command. -.. code-block:: bash - - vllm serve +```bash +vllm serve +``` -The code for the `vllm` CLI can be found in `vllm/scripts.py -`_. +The code for the `vllm` CLI can be found in . Sometimes you may see the API server entrypoint used directly instead of via the `vllm` CLI command. For example: -.. code-block:: bash - - python -m vllm.entrypoints.openai.api_server --model +```bash +python -m vllm.entrypoints.openai.api_server --model +``` -That code can be found in `vllm/entrypoints/openai/api_server.py -`_. +That code can be found in . -More details on the API server can be found in the :doc:`OpenAI Compatible +More details on the API server can be found in the {doc}`OpenAI Compatible Server ` document. -LLM Engine ----------- +## LLM Engine The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -.. image:: /assets/design/arch_overview/llm_engine.excalidraw.png - :alt: LLMEngine Diagram +```{image} /assets/design/arch_overview/llm_engine.excalidraw.png +:alt: LLMEngine Diagram +``` -LLMEngine -^^^^^^^^^ +### LLMEngine The `LLMEngine` class is the core component of the vLLM engine. It is responsible for receiving requests from clients and generating outputs from the @@ -105,21 +99,15 @@ processing. - **Input Processing**: Handles tokenization of input text using the specified tokenizer. - - **Scheduling**: Chooses which requests are processed in each step. - - **Model Execution**: Manages the execution of the language model, including distributed execution across multiple GPUs. 
- - **Output Processing**: Processes the outputs generated by the model, decoding the token IDs from a language model into human-readable text. -The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_. - -.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py +The code for `LLMEngine` can be found in . -AsyncLLMEngine -^^^^^^^^^^^^^^ +### AsyncLLMEngine The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class. It uses `asyncio` to create a background loop that continuously processes @@ -127,55 +115,46 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it can handle multiple concurrent requests and stream outputs to clients. The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo -API server that serves as a simpler example in -`vllm/entrypoints/api_server.py`_. - -.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py +API server that serves as a simpler example in . -The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_. +The code for `AsyncLLMEngine` can be found in . -.. _vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py - -Worker ------- +## Worker A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their -``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while -``local_rank`` is mainly used for assigning the accelerator device and accessing +`rank` and `local_rank`. `rank` is used for global orchestration, while +`local_rank` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. -Model Runner ------------- +## Model Runner Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. -Model ------ +## Model Every model runner object has one model object, which is the actual -``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various +`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various configurations affect the class we ultimately get. -Class Hierarchy ---------------- +## Class Hierarchy The following figure shows the class hierarchy of vLLM: - .. figure:: /assets/design/hierarchy.png - :alt: query - :width: 100% - :align: center +> ```{figure} /assets/design/hierarchy.png +> :align: center +> :alt: query +> :width: 100% +> ``` There are several important design choices behind this class hierarchy: -1. **Extensibility**: All classes in the hierarchy accept a configuration object -containing all the necessary information. The `VllmConfig -`__ +1\. **Extensibility**: All classes in the hierarchy accept a configuration object +containing all the necessary information. The [VllmConfig](https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036) class is the main configuration object that is passed around. 
The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily @@ -188,7 +167,7 @@ the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option. -2. **Uniformity**: The model runner needs a unified interface to create and +2\. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the @@ -200,46 +179,46 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -.. note:: - - To support this change, all vLLM models' signatures have been updated to: - - .. code-block:: python - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - - To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: - - .. code-block:: python - - class MyOldModel(nn.Module): - def __init__( - self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - ... - - from vllm.config import VllmConfig - class MyNewModel(MyOldModel): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - super().__init__(config, cache_config, quant_config, lora_config, prefix) - - if __version__ >= "0.6.4": - MyModel = MyNewModel - else: - MyModel = MyOldModel - - This way, the model can work with both old and new versions of vLLM. - -3. **Sharding and Quantization at Initialization**: Certain features require +````{note} +To support this change, all vLLM models' signatures have been updated to: + +```python +def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): +``` + +To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + +```python +class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... 
+ +from vllm.config import VllmConfig +class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + +if __version__ >= "0.6.4": + MyModel = MyNewModel +else: + MyModel = MyOldModel +``` + +This way, the model can work with both old and new versions of vLLM. +```` + +3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model @@ -252,23 +231,23 @@ initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea -applies to quantization. Note that we also add an additional argument ``prefix`` +applies to quantization. Note that we also add an additional argument `prefix` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where -different parts of the model are quantized differently. The ``prefix`` is -usually an empty string for the top-level model and a string like ``"vision"`` -or ``"language"`` for the sub-models. In general, it matches the name of the +different parts of the model are quantized differently. The `prefix` is +usually an empty string for the top-level model and a string like `"vision"` +or `"language"` for the sub-models. In general, it matches the name of the module's state dict in the checkpoint file. One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set -to ``None``. If the component we want to test only cares about a few fields in +to `None`. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. -In summary, the complete config object ``VllmConfig`` can be treated as an +In summary, the complete config object `VllmConfig` can be treated as an engine-level global state that is shared among all vLLM classes. diff --git a/docs/source/design/huggingface_integration.md b/docs/source/design/huggingface_integration.md new file mode 100644 index 0000000000000..99b4cb56424c6 --- /dev/null +++ b/docs/source/design/huggingface_integration.md @@ -0,0 +1,36 @@ +(huggingface-integration)= + +# Integration with HuggingFace + +This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. + +Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`. + +1. The `model` argument is `Qwen/Qwen2-7B`. 
vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process: + + - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. + - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. + +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation. + +3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that: + + - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. + - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. 
This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. + +4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation. + +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs. + +Beyond that, there are two more things vLLM depends on HuggingFace for. + +1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). + +2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. + + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: + +This completes the integration between vLLM and HuggingFace. 
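+
+The config and tokenizer loading described above is roughly equivalent to the following standalone HuggingFace calls. This is a simplified sketch for illustration rather than vLLM's actual loader code, and the variable names are our own:
+
+```python
+from transformers import AutoConfig, AutoTokenizer
+
+model = "Qwen/Qwen2-7B"  # the `model` argument
+revision = None          # the `--revision` argument (defaults to the main branch)
+
+# Steps 1-3: resolve the model and load its config. vLLM prefers its own config
+# classes for directly supported `model_type` values and falls back to
+# `AutoConfig` (with `trust_remote_code` when explicitly enabled) otherwise.
+config = AutoConfig.from_pretrained(model, revision=revision, trust_remote_code=False)
+print(config.model_type, config.architectures)  # "qwen2", ["Qwen2ForCausalLM"]
+
+# The tokenizer is loaded the same way; `--tokenizer` can point it at a different repo.
+tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
+```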
+ +In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst deleted file mode 100644 index e6c1cea6001ea..0000000000000 --- a/docs/source/design/huggingface_integration.rst +++ /dev/null @@ -1,40 +0,0 @@ -.. _huggingface_integration: - -Integration with HuggingFace -=================================== - -This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``. - -Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. - -1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process: - - - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. - - - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. - - - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. - -2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. - -3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that: - - - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. - - - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. - -4. Subsequently, vLLM applies some historical patches to the config object. 
These are mostly related to RoPE configuration; see `here `__ for the implementation. - -5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. - -Beyond that, there are two more things vLLM depends on HuggingFace for. - -1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__. - -2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. - - - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that: - -This completes the integration between vLLM and HuggingFace. - -In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md new file mode 100644 index 0000000000000..bb16920e3d0c0 --- /dev/null +++ b/docs/source/design/input_processing/input_processing_pipeline.md @@ -0,0 +1,19 @@ +(input-processing-pipeline)= + +# Input Processing Pipeline + +1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`). + +2. Tokenize the data if necessary. + +3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `. + + - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. + +4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`. + +5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`. + +6. 
If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `. + + - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/docs/source/design/input_processing/input_processing_pipeline.rst b/docs/source/design/input_processing/input_processing_pipeline.rst deleted file mode 100644 index 48abec8f75286..0000000000000 --- a/docs/source/design/input_processing/input_processing_pipeline.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _input_processing_pipeline: - -Input Processing Pipeline -========================= - -1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). - -2. Tokenize the data if necessary. - -3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. - - - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. - -4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. - -5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. - -6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. - - - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md new file mode 100644 index 0000000000000..cb415366e5a66 --- /dev/null +++ b/docs/source/design/input_processing/model_inputs_index.md @@ -0,0 +1,43 @@ +(input-processing)= + +# Input Processing + +```{eval-rst} +.. currentmodule:: vllm.inputs +``` + +Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via +{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. + +Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input +data in addition to input prompt, but it can be extended to text-only language models when needed. + +## Guides + +```{toctree} +:maxdepth: 1 + +input_processing_pipeline +``` + +## Module Contents + +### LLM Engine Inputs + +```{eval-rst} +.. autoclass:: vllm.inputs.DecoderOnlyInputs + :members: + :show-inheritance: +``` + +### Registry + +```{eval-rst} +.. autodata:: vllm.inputs.INPUT_REGISTRY +``` + +```{eval-rst} +.. automodule:: vllm.inputs.registry + :members: + :show-inheritance: +``` diff --git a/docs/source/design/input_processing/model_inputs_index.rst b/docs/source/design/input_processing/model_inputs_index.rst deleted file mode 100644 index f0ec1fea15ddb..0000000000000 --- a/docs/source/design/input_processing/model_inputs_index.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. _input_processing: - -Input Processing -================ - -.. currentmodule:: vllm.inputs - -Each model can override parts of vLLM's :ref:`input processing pipeline ` via -:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - -Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input -data in addition to input prompt, but it can be extended to text-only language models when needed. - -Guides -++++++ - -.. toctree:: - :maxdepth: 1 - - input_processing_pipeline - -Module Contents -+++++++++++++++ - -LLM Engine Inputs ------------------ - -.. 
autoclass:: vllm.inputs.DecoderOnlyInputs - :members: - :show-inheritance: - -Registry --------- - -.. autodata:: vllm.inputs.INPUT_REGISTRY - -.. automodule:: vllm.inputs.registry - :members: - :show-inheritance: diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md new file mode 100644 index 0000000000000..c21985b36eb3a --- /dev/null +++ b/docs/source/design/kernel/paged_attention.md @@ -0,0 +1,527 @@ +# vLLM Paged Attention + +- Currently, vLLM utilizes its own implementation of a multi-head query + attention kernel (`csrc/attention/attention_kernels.cu`). + This kernel is designed to be compatible with + vLLM's paged KV caches, where the key and value cache are stored in + separate blocks (note that this block concept differs from the GPU + thread block. So in a later document, I will refer to vLLM paged + attention block as "block", while refer to GPU thread block as + "thread block"). +- To achieve high performance, this kernel relies on a specially + designed memory layout and access method, specifically when threads + read data from global memory to shared memory. The purpose of this + document is to provide a high-level explanation of the kernel + implementation step by step, aiding those who wish to learn about the + vLLM multi-head query attention kernel. After going through this + document, users will likely have a better understanding and feel easier + to follow the actual implementation. +- Please note that this document may not cover all details, such as how + to calculate the correct index for the corresponding data or the dot + multiplication implementation. However, after reading this document + and becoming familiar with the high-level logic flow, it should be + easier for you to read the actual code and understand the details. + +## Inputs + +- The kernel function takes a list of arguments for the current thread + to perform its assigned work. The three most important arguments are + the input pointers `q`, `k_cache`, and `v_cache`, which point + to query, key, and value data on global memory that need to be read + and processed. The output pointer `out` points to global memory + where the result should be written. These four pointers actually + refer to multi-dimensional arrays, but each thread only accesses the + portion of data assigned to it. I have omitted all other runtime + parameters here for simplicity. + + ```cpp + template< + typename scalar_t, + int HEAD_SIZE, + int BLOCK_SIZE, + int NUM_THREADS, + int PARTITION_SIZE = 0> + __device__ void paged_attention_kernel( + ... // Other side args. + const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + ... // Other side args. + ) + ``` + +- There are also a list of template arguments above the function + signature that are determined during compilation time. `scalar_t` + represents the data type of the query, key, and value data elements, + such as FP16. `HEAD_SIZE` indicates the number of elements in each + head. `BLOCK_SIZE` refers to the number of tokens in each block. + `NUM_THREADS` denotes the number of threads in each thread block. + `PARTITION_SIZE` represents the number of tensor parallel GPUs (For + simplicity, we assume this is 0 and tensor parallel is disabled). 
+ +- With these arguments, we need to perform a sequence of preparations. + This includes calculating the current head index, block index, and + other necessary variables. However, for now, we can ignore these + preparations and proceed directly to the actual calculations. It will + be easier to understand them once we grasp the entire flow. + +## Concepts + +- Just before we dive into the calculation flow, I want to describe a + few concepts that are needed for later sections. However, you may + skip this section and return later if you encounter any confusing + terminologies. +- **Sequence**: A sequence represents a client request. For example, + the data pointed to by `q` has a shape of + `[num_seqs, num_heads, head_size]`. That represents there are total + `num_seqs` of query sequence data are pointed by `q`. Since this + kernel is a single query attention kernel, each sequence only has one + query token. Hence, the `num_seqs` equals the total number of tokens + that are processed in the batch. +- **Context**: The context consists of the generated tokens from the + sequence. For instance, `["What", "is", "your"]` are the context + tokens, and the input query token is `"name"`. The model might + generate the token `"?"`. +- **Vec**: The vec is a list of elements that are fetched and + calculated together. For query and key data, the vec size + (`VEC_SIZE`) is determined so that each thread group can fetch and + calculate 16 bytes of data at a time. For value data, the vec size + (`V_VEC_SIZE`) is determined so that each thread can fetch and + calculate 16 bytes of data at a time. For example, if the + `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the + `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8. +- **Thread group**: The thread group is a small group of + threads(`THREAD_GROUP_SIZE`) that fetches and calculates one + query token and one key token at a time. Each thread handles only a + portion of the token data. The total number of elements processed by + one thread group is referred as `x`. For example, if the thread + group contains 2 threads and the head size is 8, then thread 0 + handles the query and key elements at index 0, 2, 4, 6, while thread + 1 handles the elements at index 1, 3, 5, 7. +- **Block**: The key and value cache data in vLLM are split into + blocks. Each block stores data for a fixed number(`BLOCK_SIZE`) + of tokens at one head. Each block may contain only a portion of the + whole context tokens. For example, if the block size is 16 and the + head size is 128, then for one head, one block can store 16 * 128 = + 2048 elements. +- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that + execute simultaneously on a stream multiprocessor (SM). In this + kernel, each warp processes the calculation between one query token + and key tokens of one entire block at a time (it may process multiple + blocks in multiple iterations). For example, if there are 4 warps and + 6 blocks for one context, the assignment would be like warp 0 handles + the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 + handles the 2nd block and warp 3 handles the 3rd block. +- **Thread block**: A thread block is a group of + threads(`NUM_THREADS`) that can access the same shared memory. + Each thread block contains multiple warps(`NUM_WARPS`), and in + this kernel, each thread block processes the calculation between one + query token and key tokens of a whole context. 
+- **Grid**: A grid is a collection of thread blocks and defines the + shape of the collection. In this kernel, the shape is + `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread + block only handles the calculation for one head, one sequence, and + one partition. + +## Query + +- This section will introduce how query data is stored in memory and + fetched by each thread. As mentioned above, each thread group fetches + one query token data, while each thread itself only handles a part of + one query token data. Within each warp, every thread group will fetch + the same query token data, but will multiply it with different key + token data. + + ```cpp + const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; + ``` + + ```{figure} ../../assets/kernel/query.png + :align: center + :alt: query + :width: 70% + + Query data of one token at one head + ``` + +- Each thread defines its own `q_ptr` which points to the assigned + query token data on global memory. For example, if `VEC_SIZE` is 4 + and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains + total of 128 elements divided into 128 / 4 = 32 vecs. + + ```{figure} ../../assets/kernel/q_vecs.png + :align: center + :alt: q_vecs + :width: 70% + + `q_vecs` for one thread group + ``` + + ```cpp + __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; + ``` + +- Next, we need to read the global memory data pointed to by `q_ptr` + into shared memory as `q_vecs`. It is important to note that each + vecs is assigned to a different row. For example, if the + `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs, + while thread 1 handles the 1st row vecs. By reading the query data in + this way, neighboring threads like thread 0 and thread 1 can read + neighbor memory, achieving the memory coalescing to improve + performance. + +## Key + +- Similar to the "Query" section, this section introduces memory layout + and assignment for keys. While each thread group only handle one + query token one kernel run, it may handle multiple key tokens across + multiple iterations. Meanwhile, each warp will process multiple blocks + of key tokens in multiple iterations, ensuring that all context + tokens are processed by the entire thread group after the kernel run. + In this context, "handle" refers to performing the dot multiplication + between query data and key data. + + ```cpp + const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + + physical_block_offset * x; + ``` + +- Unlike to `q_ptr`, `k_ptr` in each thread will point to different + key token at different iterations. As shown above, that `k_ptr` + points to key token data based on `k_cache` at assigned block, + assigned head and assigned token. + + ```{figure} ../../assets/kernel/key.png + :align: center + :alt: key + :width: 70% + + Key data of all context tokens at one head + ``` + +- The diagram above illustrates the memory layout for key data. It + assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is + 8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each + rectangle represents all the elements for one key token at one head, + which will be processed by one thread group. The left half shows the + total 16 blocks of key token data for warp 0, while the right half + represents the remaining key token data for other warps or + iterations. 
Inside each rectangle, there are a total 32 vecs (128 + elements for one token) that will be processed by 2 threads (one + thread group) separately. + + ```{figure} ../../assets/kernel/k_vecs.png + :align: center + :alt: k_vecs + :width: 70% + + `k_vecs` for one thread + ``` + + ```cpp + K_vec k_vecs[NUM_VECS_PER_THREAD] + ``` + +- Next, we need to read the key token data from `k_ptr` and store + them on register memory as `k_vecs`. We use register memory for + `k_vecs` because it will only be accessed by one thread once, + whereas `q_vecs` will be accessed by multiple threads multiple + times. Each `k_vecs` will contain multiple vectors for later + calculation. Each vec will be set at each inner iteration. The + assignment of vecs allows neighboring threads in a warp to read + neighboring memory together, which again promotes the memory + coalescing. For instance, thread 0 will read vec 0, while thread 1 + will read vec 1. In the next inner loop, thread 0 will read vec 2, + while thread 1 will read vec 3, and so on. + +- You may still be a little confused about the overall flow. Don't + worry, please keep reading the next "QK" section. It will illustrate + the query and key calculation flow in a clearer and higher-level + manner. + +## QK + +- As shown the pseudo code below, before the entire for loop block, we + fetch the query data for one token and store it in `q_vecs`. Then, + in the outer for loop, we iterate through different `k_ptrs` that + point to different tokens and prepare the `k_vecs` in the inner for + loop. Finally, we perform the dot multiplication between the + `q_vecs` and each `k_vecs`. + + ```cpp + q_vecs = ... + for ... { + k_ptr = ... + for ... { + k_vecs[i] = ... + } + ... + float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs); + } + ``` + +- As mentioned before, for each thread, it only fetches part of the + query and key token data at a time. However, there will be a cross + thread group reduction happen in the `Qk_dot<>::dot` . So `qk` + returned here is not just between part of the query and key token dot + multiplication, but actually a full result between entire query and + key token data. + +- For example, if the value of `HEAD_SIZE` is 128 and + `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain + total 64 elements. However, the returned `qk` is actually the + result of dot multiplication between 128 query elements and 128 key + elements. If you want to learn more about the details of the dot + multiplication and reduction, you may refer to the implementation of + `Qk_dot<>::dot`. However, for the sake of simplicity, I will not + cover it in this document. + +## Softmax + +- Next, we need to calculate the normalized softmax for all `qk`s, + as shown above, where each $x$ represents a `qk`. To do this, + we must obtain the reduced value of `qk_max`($m(x)$) and + the `exp_sum`($\ell(x)$) of all `qk`s. The reduction + should be performed across the entire thread block, encompassing + results between the query token and all context key tokens. + + ```{math} + :nowrap: true + + \begin{gather*} + m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ + \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} + \end{gather*} + ``` + +### `qk_max` and `logits` + +- Just right after we get the `qk` result, we can set the temporary + `logits` result with `qk` (In the end, the `logits` should + store the normalized softmax result). 
Also we can compare and collect + the `qk_max` for all `qk`s that are calculated by current + thread group. + + ```cpp + if (thread_group_offset == 0) { + const bool mask = token_idx >= context_len; + logits[token_idx - start_token_idx] = mask ? 0.f : qk; + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + } + ``` + +- Please note that the `logits` here is on shared memory, so each + thread group will set the fields for its own assigned context tokens. + Overall, the size of logits should be number of context tokens. + + ```cpp + for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + + if (lane == 0) { + red_smem[warp_idx] = qk_max; + } + ``` + +- Then we need to get the reduced `qk_max` across each warp. The main + idea is to make threads in warp to communicate with each other and + get the final max `qk` . + + ```cpp + for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + qk_max = VLLM_SHFL_SYNC(qk_max, 0); + ``` + +- Finally, we can get the reduced `qk_max` from whole thread block by + compare the `qk_max` from all warps in this thread block. Then we + need to broadcast the final result to each thread. + +### `exp_sum` + +- Similar to `qk_max`, we need to get the reduced sum value from the + entire thread block too. + + ```cpp + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + float val = __expf(logits[i] - qk_max); + logits[i] = val; + exp_sum += val; + } + ... + exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); + ``` + +- Firstly, sum all exp values from each thread group, and meanwhile, + convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. + Please note, the `qk_max` here is already the max `qk` across the + whole thread block. And then we can do reduction for `exp_sum` + across whole thread block just like the `qk_max`. + + ```cpp + const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + logits[i] *= inv_sum; + } + ``` + +- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain + the final normalized softmax result as `logits`. This `logits` + variable will be used for dot multiplication with the value data in + later steps. Now, it should store the normalized softmax result of + `qk` for all assigned context tokens. + +## Value + +```{figure} ../../assets/kernel/value.png +:align: center +:alt: value +:width: 70% + +Value data of all context tokens at one head +``` + +```{figure} ../../assets/kernel/logits_vec.png +:align: center +:alt: logits_vec +:width: 50% + +`logits_vec` for one thread +``` + +```{figure} ../../assets/kernel/v_vec.png +:align: center +:alt: v_vec +:width: 70% + +List of `v_vec` for one thread +``` + +- Now we need to retrieve the value data and perform dot multiplication + with `logits`. Unlike query and key, there is no thread group + concept for value data. As shown in diagram, different from key token + memory layout, elements from the same column correspond to the same + value token. For one block of value data, there are `HEAD_SIZE` of + rows and `BLOCK_SIZE` of columns that are split into multiple + `v_vecs`. + +- Each thread always fetches `V_VEC_SIZE` elements from the same + `V_VEC_SIZE` of tokens at a time. As a result, a single thread + retrieves multiple `v_vec`s from different rows and the same + columns through multiple inner iterations. 
For each `v_vec`, it + needs to be dot multiplied with the corresponding `logits_vec`, + which is also `V_VEC_SIZE` elements from `logits`. Overall, with + multiple inner iterations, each warp will process one block of value + tokens. And with multiple outer iterations, the whole context value + tokens are processd + + ```cpp + float accs[NUM_ROWS_PER_THREAD]; + for ... { // Iteration over different blocks. + logits_vec = ... + for ... { // Iteration over different rows. + v_vec = ... + ... + accs[i] += dot(logits_vec, v_vec); + } + } + ``` + +- As shown in the above pseudo code, in the outer loop, similar to + `k_ptr`, `logits_vec` iterates over different blocks and reads + `V_VEC_SIZE` elements from `logits`. In the inner loop, each + thread reads `V_VEC_SIZE` elements from the same tokens as a + `v_vec` and performs dot multiplication. It is important to note + that in each inner iteration, the thread fetches different head + position elements for the same tokens. The dot result is then + accumulated in `accs`. Therefore, each entry of `accs` is mapped + to a head position assigned to the current thread. + +- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each + thread fetches 8 value elements for 8 tokens at a time. Each element + is from different tokens at the same head position. If `HEAD_SIZE` + is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to + fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are + a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle + a whole block of value tokens. And each `accs` in each thread + contains 8 elements that accumulated at 8 different head positions. + For the thread 0, the `accs` variable will have 8 elements, which + are 0th, 32th … 224th elements of a value head that are accumulated + from all assigned 8 tokens. + +## LV + +- Now, we need to perform reduction for `accs` within each warp. This + process allows each thread to accumulate the `accs` for the + assigned head positions of all tokens in one block. + + ```cpp + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + float acc = accs[i]; + for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { + acc += VLLM_SHFL_XOR_SYNC(acc, mask); + } + accs[i] = acc; + } + ``` + +- Next, we perform reduction for `accs` across all warps, allowing + each thread to have the accumulation of `accs` for the assigned + head positions of all context tokens. Please note that each `accs` + in every thread only stores the accumulation for a portion of + elements of the entire head for all context tokens. However, overall, + all results for output have been calculated but are just stored in + different thread register memory. + + ```cpp + float* out_smem = reinterpret_cast(shared_mem); + for (int i = NUM_WARPS; i > 1; i /= 2) { + // Upper warps write to shared memory. + ... + float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + dst[row_idx] = accs[i]; + } + + // Lower warps update the output. + const float* src = &out_smem[warp_idx * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + accs[i] += src[row_idx]; + } + + // Write out the accs. + } + ``` + +## Output + +- Now we can write all of calculated result from local register memory + to final output global memory. 
+ + ```cpp + scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + partition_idx * HEAD_SIZE; + ``` + +- First, we need to define the `out_ptr` variable, which points to + the start address of the assigned sequence and assigned head. + + ```cpp + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + from_float(*(out_ptr + row_idx), accs[i]); + } + } + ``` + +- Finally, we need to iterate over different assigned head positions + and write out the corresponding accumulated result based on the + `out_ptr`. diff --git a/docs/source/design/kernel/paged_attention.rst b/docs/source/design/kernel/paged_attention.rst deleted file mode 100644 index ba4f7a2718158..0000000000000 --- a/docs/source/design/kernel/paged_attention.rst +++ /dev/null @@ -1,525 +0,0 @@ -vLLM Paged Attention -==================== - -- Currently, vLLM utilizes its own implementation of a multi-head query - attention kernel (``csrc/attention/attention_kernels.cu``). - This kernel is designed to be compatible with - vLLM's paged KV caches, where the key and value cache are stored in - separate blocks (note that this block concept differs from the GPU - thread block. So in a later document, I will refer to vLLM paged - attention block as "block", while refer to GPU thread block as - "thread block"). -- To achieve high performance, this kernel relies on a specially - designed memory layout and access method, specifically when threads - read data from global memory to shared memory. The purpose of this - document is to provide a high-level explanation of the kernel - implementation step by step, aiding those who wish to learn about the - vLLM multi-head query attention kernel. After going through this - document, users will likely have a better understanding and feel easier - to follow the actual implementation. -- Please note that this document may not cover all details, such as how - to calculate the correct index for the corresponding data or the dot - multiplication implementation. However, after reading this document - and becoming familiar with the high-level logic flow, it should be - easier for you to read the actual code and understand the details. - -Inputs ------- - -- The kernel function takes a list of arguments for the current thread - to perform its assigned work. The three most important arguments are - the input pointers ``q``, ``k_cache``, and ``v_cache``, which point - to query, key, and value data on global memory that need to be read - and processed. The output pointer ``out`` points to global memory - where the result should be written. These four pointers actually - refer to multi-dimensional arrays, but each thread only accesses the - portion of data assigned to it. I have omitted all other runtime - parameters here for simplicity. - - .. code:: cpp - - template< - typename scalar_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - int PARTITION_SIZE = 0> - __device__ void paged_attention_kernel( - ... // Other side args. - const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - ... // Other side args. 
- ) - -- There are also a list of template arguments above the function - signature that are determined during compilation time. ``scalar_t`` - represents the data type of the query, key, and value data elements, - such as FP16. ``HEAD_SIZE`` indicates the number of elements in each - head. ``BLOCK_SIZE`` refers to the number of tokens in each block. - ``NUM_THREADS`` denotes the number of threads in each thread block. - ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (For - simplicity, we assume this is 0 and tensor parallel is disabled). -- With these arguments, we need to perform a sequence of preparations. - This includes calculating the current head index, block index, and - other necessary variables. However, for now, we can ignore these - preparations and proceed directly to the actual calculations. It will - be easier to understand them once we grasp the entire flow. - -Concepts --------- - -- Just before we dive into the calculation flow, I want to describe a - few concepts that are needed for later sections. However, you may - skip this section and return later if you encounter any confusing - terminologies. -- **Sequence**: A sequence represents a client request. For example, - the data pointed to by ``q`` has a shape of - ``[num_seqs, num_heads, head_size]``. That represents there are total - ``num_seqs`` of query sequence data are pointed by ``q``. Since this - kernel is a single query attention kernel, each sequence only has one - query token. Hence, the ``num_seqs`` equals the total number of tokens - that are processed in the batch. -- **Context**: The context consists of the generated tokens from the - sequence. For instance, ``["What", "is", "your"]`` are the context - tokens, and the input query token is ``"name"``. The model might - generate the token ``"?"``. -- **Vec**: The vec is a list of elements that are fetched and - calculated together. For query and key data, the vec size - (``VEC_SIZE``) is determined so that each thread group can fetch and - calculate 16 bytes of data at a time. For value data, the vec size - (``V_VEC_SIZE``) is determined so that each thread can fetch and - calculate 16 bytes of data at a time. For example, if the - ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the - ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8. -- **Thread group**: The thread group is a small group of - threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one - query token and one key token at a time. Each thread handles only a - portion of the token data. The total number of elements processed by - one thread group is referred as ``x``. For example, if the thread - group contains 2 threads and the head size is 8, then thread 0 - handles the query and key elements at index 0, 2, 4, 6, while thread - 1 handles the elements at index 1, 3, 5, 7. -- **Block**: The key and value cache data in vLLM are split into - blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``) - of tokens at one head. Each block may contain only a portion of the - whole context tokens. For example, if the block size is 16 and the - head size is 128, then for one head, one block can store 16 \* 128 = - 2048 elements. -- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that - execute simultaneously on a stream multiprocessor (SM). In this - kernel, each warp processes the calculation between one query token - and key tokens of one entire block at a time (it may process multiple - blocks in multiple iterations). 
For example, if there are 4 warps and - 6 blocks for one context, the assignment would be like warp 0 handles - the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 - handles the 2nd block and warp 3 handles the 3rd block. -- **Thread block**: A thread block is a group of - threads(\ ``NUM_THREADS``) that can access the same shared memory. - Each thread block contains multiple warps(\ ``NUM_WARPS``), and in - this kernel, each thread block processes the calculation between one - query token and key tokens of a whole context. -- **Grid**: A grid is a collection of thread blocks and defines the - shape of the collection. In this kernel, the shape is - ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread - block only handles the calculation for one head, one sequence, and - one partition. - -Query ------ - -- This section will introduce how query data is stored in memory and - fetched by each thread. As mentioned above, each thread group fetches - one query token data, while each thread itself only handles a part of - one query token data. Within each warp, every thread group will fetch - the same query token data, but will multiply it with different key - token data. - - .. code:: cpp - - const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; - - .. figure:: ../../assets/kernel/query.png - :alt: query - :width: 70% - :align: center - - Query data of one token at one head - -- Each thread defines its own ``q_ptr`` which points to the assigned - query token data on global memory. For example, if ``VEC_SIZE`` is 4 - and ``HEAD_SIZE`` is 128, the ``q_ptr`` points to data that contains - total of 128 elements divided into 128 / 4 = 32 vecs. - - .. figure:: ../../assets/kernel/q_vecs.png - :alt: q_vecs - :width: 70% - :align: center - - ``q_vecs`` for one thread group - - .. code:: cpp - - __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; - -- Next, we need to read the global memory data pointed to by ``q_ptr`` - into shared memory as ``q_vecs``. It is important to note that each - vecs is assigned to a different row. For example, if the - ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs, - while thread 1 handles the 1st row vecs. By reading the query data in - this way, neighboring threads like thread 0 and thread 1 can read - neighbor memory, achieving the memory coalescing to improve - performance. - -Key ---- - -- Similar to the "Query" section, this section introduces memory layout - and assignment for keys. While each thread group only handle one - query token one kernel run, it may handle multiple key tokens across - multiple iterations. Meanwhile, each warp will process multiple blocks - of key tokens in multiple iterations, ensuring that all context - tokens are processed by the entire thread group after the kernel run. - In this context, "handle" refers to performing the dot multiplication - between query data and key data. - - .. code:: cpp - - const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; - -- Unlike to ``q_ptr``, ``k_ptr`` in each thread will point to different - key token at different iterations. As shown above, that ``k_ptr`` - points to key token data based on ``k_cache`` at assigned block, - assigned head and assigned token. - - .. 
figure:: ../../assets/kernel/key.png - :alt: key - :width: 70% - :align: center - - Key data of all context tokens at one head - -- The diagram above illustrates the memory layout for key data. It - assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is - 8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each - rectangle represents all the elements for one key token at one head, - which will be processed by one thread group. The left half shows the - total 16 blocks of key token data for warp 0, while the right half - represents the remaining key token data for other warps or - iterations. Inside each rectangle, there are a total 32 vecs (128 - elements for one token) that will be processed by 2 threads (one - thread group) separately. - - .. figure:: ../../assets/kernel/k_vecs.png - :alt: k_vecs - :width: 70% - :align: center - - ``k_vecs`` for one thread - - .. code:: cpp - - K_vec k_vecs[NUM_VECS_PER_THREAD] - -- Next, we need to read the key token data from ``k_ptr`` and store - them on register memory as ``k_vecs``. We use register memory for - ``k_vecs`` because it will only be accessed by one thread once, - whereas ``q_vecs`` will be accessed by multiple threads multiple - times. Each ``k_vecs`` will contain multiple vectors for later - calculation. Each vec will be set at each inner iteration. The - assignment of vecs allows neighboring threads in a warp to read - neighboring memory together, which again promotes the memory - coalescing. For instance, thread 0 will read vec 0, while thread 1 - will read vec 1. In the next inner loop, thread 0 will read vec 2, - while thread 1 will read vec 3, and so on. -- You may still be a little confused about the overall flow. Don't - worry, please keep reading the next "QK" section. It will illustrate - the query and key calculation flow in a clearer and higher-level - manner. - -QK ---- - -- As shown the pseudo code below, before the entire for loop block, we - fetch the query data for one token and store it in ``q_vecs``. Then, - in the outer for loop, we iterate through different ``k_ptrs`` that - point to different tokens and prepare the ``k_vecs`` in the inner for - loop. Finally, we perform the dot multiplication between the - ``q_vecs`` and each ``k_vecs``. - - .. code:: cpp - - q_vecs = ... - for ... { - k_ptr = ... - for ... { - k_vecs[i] = ... - } - ... - float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs); - } - -- As mentioned before, for each thread, it only fetches part of the - query and key token data at a time. However, there will be a cross - thread group reduction happen in the ``Qk_dot<>::dot`` . So ``qk`` - returned here is not just between part of the query and key token dot - multiplication, but actually a full result between entire query and - key token data. -- For example, if the value of ``HEAD_SIZE`` is 128 and - ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain - total 64 elements. However, the returned ``qk`` is actually the - result of dot multiplication between 128 query elements and 128 key - elements. If you want to learn more about the details of the dot - multiplication and reduction, you may refer to the implementation of - ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not - cover it in this document. - -Softmax -------- - -- Next, we need to calculate the normalized softmax for all ``qk``\ s, - as shown above, where each :math:`x` represents a ``qk``. 
To do this, - we must obtain the reduced value of ``qk_max``\ (:math:`m(x)`) and - the ``exp_sum``\ (:math:`\ell(x)`) of all ``qk``\ s. The reduction - should be performed across the entire thread block, encompassing - results between the query token and all context key tokens. - - .. math:: - :nowrap: - - \begin{gather*} - m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ - \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} - \end{gather*} - -``qk_max`` and ``logits`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -- Just right after we get the ``qk`` result, we can set the temporary - ``logits`` result with ``qk`` (In the end, the ``logits`` should - store the normalized softmax result). Also we can compare and collect - the ``qk_max`` for all ``qk``\ s that are calculated by current - thread group. - - .. code:: cpp - - if (thread_group_offset == 0) { - const bool mask = token_idx >= context_len; - logits[token_idx - start_token_idx] = mask ? 0.f : qk; - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - } - -- Please note that the ``logits`` here is on shared memory, so each - thread group will set the fields for its own assigned context tokens. - Overall, the size of logits should be number of context tokens. - - .. code:: cpp - - for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - - if (lane == 0) { - red_smem[warp_idx] = qk_max; - } - -- Then we need to get the reduced ``qk_max`` across each warp. The main - idea is to make threads in warp to communicate with each other and - get the final max ``qk`` . - - .. code:: cpp - - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - qk_max = VLLM_SHFL_SYNC(qk_max, 0); - -- Finally, we can get the reduced ``qk_max`` from whole thread block by - compare the ``qk_max`` from all warps in this thread block. Then we - need to broadcast the final result to each thread. - -``exp_sum`` -~~~~~~~~~~~ - -- Similar to ``qk_max``, we need to get the reduced sum value from the - entire thread block too. - - .. code:: cpp - - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - float val = __expf(logits[i] - qk_max); - logits[i] = val; - exp_sum += val; - } - ... - exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); - -- Firstly, sum all exp values from each thread group, and meanwhile, - convert each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``. - Please note, the ``qk_max`` here is already the max ``qk`` across the - whole thread block. And then we can do reduction for ``exp_sum`` - across whole thread block just like the ``qk_max``. - - .. code:: cpp - - const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - logits[i] *= inv_sum; - } - -- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain - the final normalized softmax result as ``logits``. This ``logits`` - variable will be used for dot multiplication with the value data in - later steps. Now, it should store the normalized softmax result of - ``qk`` for all assigned context tokens. - -Value ------ - -.. figure:: ../../assets/kernel/value.png - :alt: value - :width: 70% - :align: center - - Value data of all context tokens at one head - -.. 
figure:: ../../assets/kernel/logits_vec.png - :alt: logits_vec - :width: 50% - :align: center - - ``logits_vec`` for one thread - -.. figure:: ../../assets/kernel/v_vec.png - :alt: v_vec - :width: 70% - :align: center - - List of ``v_vec`` for one thread - -- Now we need to retrieve the value data and perform dot multiplication - with ``logits``. Unlike query and key, there is no thread group - concept for value data. As shown in diagram, different from key token - memory layout, elements from the same column correspond to the same - value token. For one block of value data, there are ``HEAD_SIZE`` of - rows and ``BLOCK_SIZE`` of columns that are split into multiple - ``v_vecs``. -- Each thread always fetches ``V_VEC_SIZE`` elements from the same - ``V_VEC_SIZE`` of tokens at a time. As a result, a single thread - retrieves multiple ``v_vec``\ s from different rows and the same - columns through multiple inner iterations. For each ``v_vec``, it - needs to be dot multiplied with the corresponding ``logits_vec``, - which is also ``V_VEC_SIZE`` elements from ``logits``. Overall, with - multiple inner iterations, each warp will process one block of value - tokens. And with multiple outer iterations, the whole context value - tokens are processd - - .. code:: cpp - - float accs[NUM_ROWS_PER_THREAD]; - for ... { // Iteration over different blocks. - logits_vec = ... - for ... { // Iteration over different rows. - v_vec = ... - ... - accs[i] += dot(logits_vec, v_vec); - } - } - -- As shown in the above pseudo code, in the outer loop, similar to - ``k_ptr``, ``logits_vec`` iterates over different blocks and reads - ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each - thread reads ``V_VEC_SIZE`` elements from the same tokens as a - ``v_vec`` and performs dot multiplication. It is important to note - that in each inner iteration, the thread fetches different head - position elements for the same tokens. The dot result is then - accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped - to a head position assigned to the current thread. -- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each - thread fetches 8 value elements for 8 tokens at a time. Each element - is from different tokens at the same head position. If ``HEAD_SIZE`` - is 128 and ``WARP_SIZE`` is 32, for each inner loop, a warp needs to - fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This means there are - a total of 128 \* 16 / 256 = 8 inner iterations for a warp to handle - a whole block of value tokens. And each ``accs`` in each thread - contains 8 elements that accumulated at 8 different head positions. - For the thread 0, the ``accs`` variable will have 8 elements, which - are 0th, 32th … 224th elements of a value head that are accumulated - from all assigned 8 tokens. - -LV ---- -- Now, we need to perform reduction for ``accs`` within each warp. This - process allows each thread to accumulate the ``accs`` for the - assigned head positions of all tokens in one block. - - .. code:: cpp - - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - float acc = accs[i]; - for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { - acc += VLLM_SHFL_XOR_SYNC(acc, mask); - } - accs[i] = acc; - } - -- Next, we perform reduction for ``accs`` across all warps, allowing - each thread to have the accumulation of ``accs`` for the assigned - head positions of all context tokens. 
Please note that each ``accs`` - in every thread only stores the accumulation for a portion of - elements of the entire head for all context tokens. However, overall, - all results for output have been calculated but are just stored in - different thread register memory. - - .. code:: cpp - - float* out_smem = reinterpret_cast(shared_mem); - for (int i = NUM_WARPS; i > 1; i /= 2) { - // Upper warps write to shared memory. - ... - float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - dst[row_idx] = accs[i]; - } - - // Lower warps update the output. - const float* src = &out_smem[warp_idx * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - accs[i] += src[row_idx]; - } - - // Write out the accs. - } - -Output ------- - -- Now we can write all of calculated result from local register memory - to final output global memory. - - .. code:: cpp - - scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE - + partition_idx * HEAD_SIZE; - -- First, we need to define the ``out_ptr`` variable, which points to - the start address of the assigned sequence and assigned head. - - .. code:: cpp - - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - from_float(*(out_ptr + row_idx), accs[i]); - } - } - -- Finally, we need to iterate over different assigned head positions - and write out the corresponding accumulated result based on the - ``out_ptr``. diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md new file mode 100644 index 0000000000000..bcccd284879bb --- /dev/null +++ b/docs/source/design/multimodal/adding_multimodal_plugin.md @@ -0,0 +1,16 @@ +(adding-multimodal-plugin)= + +# Adding a Multimodal Plugin + +This document teaches you how to add a new modality to vLLM. + +Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. +For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`. + +The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s. + +```{note} +This article is a work in progress. +``` + +% TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.rst b/docs/source/design/multimodal/adding_multimodal_plugin.rst deleted file mode 100644 index b726138f840a3..0000000000000 --- a/docs/source/design/multimodal/adding_multimodal_plugin.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _adding_multimodal_plugin: - -Adding a Multimodal Plugin -========================== - -This document teaches you how to add a new modality to vLLM. - -Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. -For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`. - -The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s. - -.. note:: - This article is a work in progress. - -.. 
- TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.md similarity index 61% rename from docs/source/design/multimodal/multimodal_index.rst rename to docs/source/design/multimodal/multimodal_index.md index c6d47f90b62d5..88af07afc7018 100644 --- a/docs/source/design/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.md @@ -1,66 +1,83 @@ -.. _multi_modality: +(multi-modality)= -Multi-Modality -============== +# Multi-Modality +```{eval-rst} .. currentmodule:: vllm.multimodal - -vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. +``` -Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` -via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. + +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities -by following :ref:`this guide `. +by following [this guide](#adding-multimodal-plugin). -Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `. +Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). -Guides -++++++ +## Guides -.. toctree:: - :maxdepth: 1 +```{toctree} +:maxdepth: 1 - adding_multimodal_plugin +adding_multimodal_plugin +``` -Module Contents -+++++++++++++++ +## Module Contents +```{eval-rst} .. automodule:: vllm.multimodal +``` -Registry --------- +### Registry +```{eval-rst} .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalRegistry :members: :show-inheritance: +``` -Base Classes ------------- +### Base Classes +```{eval-rst} .. autodata:: vllm.multimodal.NestedTensors +``` +```{eval-rst} .. autodata:: vllm.multimodal.BatchedTensorInputs +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalDataBuiltins :members: :show-inheritance: +``` +```{eval-rst} .. autodata:: vllm.multimodal.MultiModalDataDict +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalPlugin :members: :show-inheritance: +``` -Image Classes -------------- +### Image Classes +```{eval-rst} .. automodule:: vllm.multimodal.image :members: :show-inheritance: +``` diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index b58456ecc6da8..34564413b34f6 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -2,13 +2,14 @@ ## Debugging -Please see the [Debugging -Tips](https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing) +Please see the [Debugging Tips](#debugging-python-multiprocessing) page for information on known issues and how to solve them. ## Introduction -*Note that source code references are to the state of the code at the time of writing in December, 2024.* +```{important} +The source code references are to the state of the code at the time of writing in December, 2024. 
+``` The use of Python multiprocessing in vLLM is complicated by: @@ -20,7 +21,7 @@ This document describes how vLLM deals with these challenges. ## Multiprocessing Methods -[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python 3.14. @@ -82,7 +83,7 @@ There are other miscellaneous places hard-coding the use of `spawn`: Related PRs: -- +- ## Prior State in v1 @@ -96,7 +97,7 @@ engine core. - - -- https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45 +- It was off by default for all the reasons mentioned above - compatibility with dependencies and code using vLLM as a library. @@ -119,17 +120,17 @@ instruct users to either add a `__main__` guard or to disable multiprocessing. If that known-failure case occurs, the user will see two messages that explain what is happening. First, a log message from vLLM: -``` - WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously - initialized. We must use the `spawn` multiprocessing start method. Setting - VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing - for more information. +```console +WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously + initialized. We must use the `spawn` multiprocessing start method. Setting + VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See + https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + for more information. ``` Second, Python itself will raise an exception with a nice explanation: -``` +```console RuntimeError: An attempt has been made to start a new process before the current process has finished its bootstrapping phase. diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md new file mode 100644 index 0000000000000..79aff757518f2 --- /dev/null +++ b/docs/source/design/plugin_system.md @@ -0,0 +1,54 @@ +(plugin-system)= + +# vLLM's Plugin System + +The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. + +## How Plugins Work in vLLM + +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. + +## How vLLM Discovers Plugins + +vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. 
An example of a plugin: + +```python +# inside `setup.py` file +from setuptools import setup + +setup(name='vllm_add_dummy_model', + version='0.1', + packages=['vllm_add_dummy_model'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_model:register"] + }) + +# inside `vllm_add_dummy_model.py` file +def register(): + from vllm import ModelRegistry + + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model("MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava") +``` + +For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). + +Every plugin has three parts: + +1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins. +2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name. +3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module. + +## What Can Plugins Do? + +Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. + +## Guidelines for Writing Plugins + +- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. + +## Compatibility Guarantee + +vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst deleted file mode 100644 index 5a96cc8b3a464..0000000000000 --- a/docs/source/design/plugin_system.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. _plugin_system: - -vLLM's Plugin System -==================== - -The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. - -How Plugins Work in vLLM ------------------------- - -Plugins are user-registered code that vLLM executes. 
Given vLLM's architecture (see :ref:`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. - -How vLLM Discovers Plugins --------------------------- - -vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: - -.. code-block:: python - - # inside `setup.py` file - from setuptools import setup - - setup(name='vllm_add_dummy_model', - version='0.1', - packages=['vllm_add_dummy_model'], - entry_points={ - 'vllm.general_plugins': - ["register_dummy_model = vllm_add_dummy_model:register"] - }) - - # inside `vllm_add_dummy_model.py` file - def register(): - from vllm import ModelRegistry - - if "MyLlava" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model("MyLlava", - "vllm_add_dummy_model.my_llava:MyLlava") - -For more information on adding entry points to your package, please check the `official documentation `__. - -Every plugin has three parts: - -1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins. - -2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name. - -3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module. - -What Can Plugins Do? --------------------- - -Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. - -Guidelines for Writing Plugins ------------------------------- - -- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. - -Compatibility Guarantee ------------------------ - -vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. 
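To make the discovery mechanism described above more concrete, here is a minimal sketch of how an entry-point based loader can work using only the standard library. It is not vLLM's actual `load_general_plugins` implementation; the group name `vllm.general_plugins` and the `VLLM_PLUGINS` filter come from the text above, while the helper name `load_general_plugins_sketch` is invented for illustration. It assumes Python 3.10 or newer for the `group=` keyword of `entry_points`.

```python
# Sketch only: illustrates entry-point discovery and VLLM_PLUGINS filtering,
# not vLLM's real loader.
import os
from importlib.metadata import entry_points


def load_general_plugins_sketch() -> None:
    # VLLM_PLUGINS, if set, is a comma-separated allow-list of plugin names.
    allowed = os.environ.get("VLLM_PLUGINS")
    allowed_names = allowed.split(",") if allowed is not None else None

    for ep in entry_points(group="vllm.general_plugins"):
        if allowed_names is not None and ep.name not in allowed_names:
            continue  # filtered out by VLLM_PLUGINS
        register_fn = ep.load()  # resolves e.g. "vllm_add_dummy_model:register"
        register_fn()            # should be re-entrant, per the guidelines above


if __name__ == "__main__":
    load_general_plugins_sketch()
```

In practice, installing a package that declares the `vllm.general_plugins` entry point (as in the `vllm_add_dummy_model` example) is all a plugin author needs to do; vLLM performs the discovery itself.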
diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.md similarity index 59% rename from docs/source/dev/engine/async_llm_engine.rst rename to docs/source/dev/engine/async_llm_engine.md index 93fc310cb543b..904feaa505164 100644 --- a/docs/source/dev/engine/async_llm_engine.rst +++ b/docs/source/dev/engine/async_llm_engine.md @@ -1,6 +1,7 @@ -AsyncLLMEngine -================================= +# AsyncLLMEngine +```{eval-rst} .. autoclass:: vllm.AsyncLLMEngine :members: :show-inheritance: +``` diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/dev/engine/engine_index.md new file mode 100644 index 0000000000000..701cb95d3be33 --- /dev/null +++ b/docs/source/dev/engine/engine_index.md @@ -0,0 +1,17 @@ +# vLLM Engine + +```{eval-rst} +.. automodule:: vllm.engine +``` + +```{eval-rst} +.. currentmodule:: vllm.engine +``` + +```{toctree} +:caption: Engines +:maxdepth: 2 + +llm_engine +async_llm_engine +``` diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst deleted file mode 100644 index ba9ae55ddea46..0000000000000 --- a/docs/source/dev/engine/engine_index.rst +++ /dev/null @@ -1,13 +0,0 @@ -vLLM Engine -================================= - -.. automodule:: vllm.engine -.. currentmodule:: vllm.engine - -.. toctree:: - :maxdepth: 2 - :caption: Engines - - llm_engine - async_llm_engine - diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.md similarity index 60% rename from docs/source/dev/engine/llm_engine.rst rename to docs/source/dev/engine/llm_engine.md index 0b8c1e219d7c9..d6613ef5562dc 100644 --- a/docs/source/dev/engine/llm_engine.rst +++ b/docs/source/dev/engine/llm_engine.md @@ -1,6 +1,7 @@ -LLMEngine -================================= +# LLMEngine +```{eval-rst} .. autoclass:: vllm.LLMEngine :members: :show-inheritance: +``` diff --git a/docs/source/dev/offline_inference/llm.rst b/docs/source/dev/offline_inference/llm.md similarity index 67% rename from docs/source/dev/offline_inference/llm.rst rename to docs/source/dev/offline_inference/llm.md index 83ba1b6987c6d..9f129d5e41686 100644 --- a/docs/source/dev/offline_inference/llm.rst +++ b/docs/source/dev/offline_inference/llm.md @@ -1,6 +1,7 @@ -LLM Class -========= +# LLM Class +```{eval-rst} .. autoclass:: vllm.LLM :members: :show-inheritance: +``` diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.md similarity index 78% rename from docs/source/dev/offline_inference/llm_inputs.rst rename to docs/source/dev/offline_inference/llm_inputs.md index 0d47281db485e..21f688a12c536 100644 --- a/docs/source/dev/offline_inference/llm_inputs.rst +++ b/docs/source/dev/offline_inference/llm_inputs.md @@ -1,14 +1,19 @@ -LLM Inputs -========== +# LLM Inputs +```{eval-rst} .. autodata:: vllm.inputs.PromptType +``` +```{eval-rst} .. autoclass:: vllm.inputs.TextPrompt :show-inheritance: :members: :member-order: bysource +``` +```{eval-rst} .. 
autoclass:: vllm.inputs.TokensPrompt :show-inheritance: :members: :member-order: bysource +``` diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md new file mode 100644 index 0000000000000..318a02d8c78df --- /dev/null +++ b/docs/source/dev/offline_inference/offline_index.md @@ -0,0 +1,8 @@ +# Offline Inference + +```{toctree} +:maxdepth: 1 + +llm +llm_inputs +``` diff --git a/docs/source/dev/offline_inference/offline_index.rst b/docs/source/dev/offline_inference/offline_index.rst deleted file mode 100644 index 27dfb0e9df90e..0000000000000 --- a/docs/source/dev/offline_inference/offline_index.rst +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference -================================= - -.. toctree:: - :maxdepth: 1 - - llm - llm_inputs diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.md similarity index 55% rename from docs/source/dev/pooling_params.rst rename to docs/source/dev/pooling_params.md index 334e0287aff09..74b2c57443e4b 100644 --- a/docs/source/dev/pooling_params.rst +++ b/docs/source/dev/pooling_params.md @@ -1,5 +1,6 @@ -Pooling Parameters -================== +# Pooling Parameters +```{eval-rst} .. autoclass:: vllm.PoolingParams :members: +``` diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.md similarity index 55% rename from docs/source/dev/sampling_params.rst rename to docs/source/dev/sampling_params.md index f645941a6c022..bdc36af5153db 100644 --- a/docs/source/dev/sampling_params.rst +++ b/docs/source/dev/sampling_params.md @@ -1,5 +1,6 @@ -Sampling Parameters -=================== +# Sampling Parameters +```{eval-rst} .. autoclass:: vllm.SamplingParams :members: +``` diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 79b49a186236a..aef32f7559f74 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -15,18 +15,12 @@ def fix_case(text: str) -> str: return text -def underline(title: str, character: str = "=") -> str: - return f"{title}\n{character * len(title)}" - - def generate_title(filename: str) -> str: # Turn filename into a title title = filename.replace("_", " ").title() # Handle acronyms and names title = fix_case(title) - # Underline title - title = underline(title) - return title + return f"# {title}" def generate_examples(): @@ -38,24 +32,23 @@ def generate_examples(): # Destination paths doc_dir = root_dir / "docs/source/getting_started/examples" - doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths] + doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths] # Generate the example docs for each example script for script_path, doc_path in zip(script_paths, doc_paths): - script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}" # Make script_path relative to doc_path and call it include_path include_path = '../../../..' / script_path.relative_to(root_dir) content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source {script_url}.\n\n" - f".. 
literalinclude:: {include_path}\n" - " :language: python\n" - " :linenos:\n") + f"Source: .\n\n" + f"```{{literalinclude}} {include_path}\n" + ":language: python\n" + ":linenos:\n```") with open(doc_path, "w+") as f: f.write(content) # Generate the toctree for the example scripts - with open(doc_dir / "examples_index.template.rst") as f: + with open(doc_dir / "examples_index.template.md") as f: examples_index = f.read() - with open(doc_dir / "examples_index.rst", "w+") as f: - example_docs = "\n ".join(path.stem for path in script_paths) + with open(doc_dir / "examples_index.md", "w+") as f: + example_docs = "\n".join(path.stem + ".md" for path in script_paths) f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md new file mode 100644 index 0000000000000..6d01efbbf8828 --- /dev/null +++ b/docs/source/getting_started/amd-installation.md @@ -0,0 +1,163 @@ +(installation-rocm)= + +# Installation with ROCm + +vLLM supports AMD GPUs with ROCm 6.2. + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- ROCm 6.2 + +Installation options: + +1. [Build from source with docker](#build-from-source-docker-rocm) +2. [Build from source](#build-from-source-rocm) + +(build-from-source-docker-rocm)= + +## Option 1: Build from source with docker (recommended) + +You can build and install vLLM from source. + +First, build a docker image from and launch a docker container from the image. +It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: + +```console +{ + "features": { + "buildkit": true + } +} +``` + + uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. +It provides flexibility to customize the build of docker image using the following arguments: + +- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. +- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. +- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` +- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` +- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. + +Their values can be passed in when running `docker build` with `--build-arg` options. + +To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: + +```console +$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +``` + +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: + +```console +$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . 
+``` + +To run the above docker image `vllm-rocm`, use the below command: + +```console +$ docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + -v :/app/model \ + vllm-rocm \ + bash +``` + +Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. + +(build-from-source-rocm)= + +## Option 2: Build from source + +0. Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) +- [PyTorch](https://pytorch.org/) + +For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + +Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + +1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) + +Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + +```console +$ python3 -m pip install ninja cmake wheel pybind11 +$ pip uninstall -y triton +$ git clone https://github.com/OpenAI/triton.git +$ cd triton +$ git checkout e192dba +$ cd python +$ pip3 install . +$ cd ../.. +``` + +```{note} +- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. +``` + +2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) + +Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) +Alternatively, wheels intended for vLLM use can be accessed under the releases. + +For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. + +```console +$ git clone https://github.com/ROCm/flash-attention.git +$ cd flash-attention +$ git checkout 3cea2fb +$ git submodule update --init +$ GPU_ARCHS="gfx90a" python3 setup.py install +$ cd .. +``` + +```{note} +- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) +``` + +3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: + +```bash +$ pip install --upgrade pip + +# Install PyTorch +$ pip uninstall torch -y +$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + +# Build & install AMD SMI +$ pip install /opt/rocm/share/amd_smi + +# Install dependencies +$ pip install --upgrade numba scipy huggingface-hub[cli] +$ pip install "numpy<2" +$ pip install -r requirements-rocm.txt + +# Build vLLM for MI210/MI250/MI300. +$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" +$ python3 setup.py develop +``` + +This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation. + +```{tip} +- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. 
+- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. +- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. +- The ROCm version of PyTorch, ideally, should match the ROCm driver version. +``` + +```{tip} +- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). +``` diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst deleted file mode 100644 index ece5d785e0c65..0000000000000 --- a/docs/source/getting_started/amd-installation.rst +++ /dev/null @@ -1,178 +0,0 @@ -.. _installation_rocm: - -Installation with ROCm -====================== - -vLLM supports AMD GPUs with ROCm 6.2. - -Requirements ------------- - -* OS: Linux -* Python: 3.9 -- 3.12 -* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) -* ROCm 6.2 - -Installation options: - -#. :ref:`Build from source with docker ` -#. :ref:`Build from source ` - -.. _build_from_source_docker_rocm: - -Option 1: Build from source with docker (recommended) ------------------------------------------------------ - -You can build and install vLLM from source. - -First, build a docker image from `Dockerfile.rocm `_ and launch a docker container from the image. -It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: - -.. code-block:: console - - { - "features": { - "buildkit": true - } - } - - -`Dockerfile.rocm `_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. -It provides flexibility to customize the build of docker image using the following arguments: - -* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. -* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target. -* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo `_. The default is `ae7928c` -* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. - -Their values can be passed in when running ``docker build`` with ``--build-arg`` options. - - -To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: - -.. code-block:: console - - $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . - -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below: - -.. 
code-block:: console - - $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . - -To run the above docker image ``vllm-rocm``, use the below command: - -.. code-block:: console - - $ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v :/app/model \ - vllm-rocm \ - bash - -Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. - - -.. _build_from_source_rocm: - -Option 2: Build from source ---------------------------- - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- `ROCm `_ -- `PyTorch `_ - -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. - -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started `_ - - -1. Install `Triton flash attention for ROCm `_ - -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton `_ - - .. code-block:: console - - $ python3 -m pip install ninja cmake wheel pybind11 - $ pip uninstall -y triton - $ git clone https://github.com/OpenAI/triton.git - $ cd triton - $ git checkout e192dba - $ cd python - $ pip3 install . - $ cd ../.. - -.. note:: - - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - - -2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm `_ - - -Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention `_ -Alternatively, wheels intended for vLLM use can be accessed under the releases. - -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. -Note to get your gfx architecture, run `rocminfo |grep gfx`. - - .. code-block:: console - - $ git clone https://github.com/ROCm/flash-attention.git - $ cd flash-attention - $ git checkout 3cea2fb - $ git submodule update --init - $ GPU_ARCHS="gfx90a" python3 setup.py install - $ cd .. - -.. note:: - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - -3. Build vLLM. - - For example, vLLM on ROCM 6.2 can be built with the following steps: - - .. code-block:: console - - $ pip install --upgrade pip - - $ # Install PyTorch - $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 - - $ # Build & install AMD SMI - $ pip install /opt/rocm/share/amd_smi - - $ # Install dependencies - $ pip install --upgrade numba scipy huggingface-hub[cli] - $ pip install "numpy<2" - $ pip install -r requirements-rocm.txt - - $ # Build vLLM for MI210/MI250/MI300. - $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - $ python3 setup.py develop - - - This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation. - - -.. tip:: - - - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - Triton flash attention does not currently support sliding window attention. 
If using half precision, please use CK flash-attention for sliding window support. - - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - - -.. tip:: - - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to `vLLM performance optimization `_. - - diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md new file mode 100644 index 0000000000000..de807e198b4f6 --- /dev/null +++ b/docs/source/getting_started/arm-installation.md @@ -0,0 +1,46 @@ +(installation-arm)= + +# Installation for ARM CPUs + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: + +- CPU backend inference capabilities +- Relevant runtime environment variables +- Performance optimization tips + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. +Contents: + +1. [Requirements](#arm-backend-requirements) +2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile) +3. [Building from Source](#build-arm-backend-from-source) + +(arm-backend-requirements)= + +## Requirements + +- **Operating System**: Linux or macOS +- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +- **Instruction Set Architecture (ISA)**: NEON support is required + +(arm-backend-quick-start-dockerfile)= + +## Quick Start with Dockerfile + +You can quickly set up vLLM on ARM using Docker: + +```console +$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env +``` + +(build-arm-backend-from-source)= + +## Building from Source + +To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst deleted file mode 100644 index 7b457df92c11d..0000000000000 --- a/docs/source/getting_started/arm-installation.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. _installation_arm: - -Installation for ARM CPUs -========================= - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: - -* CPU backend inference capabilities -* Relevant runtime environment variables -* Performance optimization tips - -ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. -Contents: - -1. :ref:`Requirements ` -2. :ref:`Quick Start with Dockerfile ` -3. :ref:`Building from Source ` - -.. _arm_backend_requirements: - -Requirements ------------- - -* **Operating System**: Linux or macOS -* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) -* **Instruction Set Architecture (ISA)**: NEON support is required - -.. 
_arm_backend_quick_start_dockerfile: - -Quick Start with Dockerfile ---------------------------- - -You can quickly set up vLLM on ARM using Docker: - -.. code-block:: console - - $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env - -.. _build_arm_backend_from_source: - -Building from Source --------------------- - -To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md new file mode 100644 index 0000000000000..b6f181ace6274 --- /dev/null +++ b/docs/source/getting_started/cpu-installation.md @@ -0,0 +1,154 @@ +(installation-cpu)= + +# Installation with CPU + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: + +- Tensor Parallel +- Model Quantization (`INT8 W8A8, AWQ`) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) + +Table of contents: + +1. [Requirements](#cpu-backend-requirements) +2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile) +3. [Build from source](#build-cpu-backend-from-source) +4. [Related runtime environment variables](#env-intro) +5. [Intel Extension for PyTorch](#ipex-guidance) +6. [Performance tips](#cpu-backend-performance-tips) + +(cpu-backend-requirements)= + +## Requirements + +- OS: Linux +- Compiler: gcc/g++>=12.3.0 (optional, recommended) +- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) + +(cpu-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env +``` + +(build-cpu-backend-from-source)= + +## Build from source + +- First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: + +```console +$ sudo apt-get update -y +$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev +$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +- Second, install Python packages for vLLM CPU backend building: + +```console +$ pip install --upgrade pip +$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy +$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +``` + +- Finally, build and install vLLM CPU backend: + +```console +$ VLLM_TARGET_DEVICE=cpu python setup.py install +``` + +```{note} +- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. +- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. 
+``` + +(env-intro)= + +## Related runtime environment variables + +- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. +- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. + +(ipex-guidance)= + +## Intel Extension for PyTorch + +- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. + +(cpu-backend-performance-tips)= + +## Performance tips + +- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: + +```console +$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +$ find / -name *libtcmalloc* # find the dynamic link library path +$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +$ python examples/offline_inference.py # run vLLM +``` + +- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: + +```console +$ export VLLM_CPU_KVCACHE_SPACE=40 +$ export VLLM_CPU_OMP_THREADS_BIND=0-29 +$ vllm serve facebook/opt-125m +``` + +- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: + +```console +$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores + +# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. 
+CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ +0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 +1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 +2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 +3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 +4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 +5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 +6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 +7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 +8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 +9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 +10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 +11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 +12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 +13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 +14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 +15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + +# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 +$ export VLLM_CPU_OMP_THREADS_BIND=0-7 +$ python examples/offline_inference.py +``` + +- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. + +## CPU Backend Considerations + +- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. + +- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. + +- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. + + - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: + + ```console + $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + ``` + + - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). 
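To connect the runtime environment variables and serving commands above to a complete run, here is a small offline-inference sketch for the CPU backend. It assumes vLLM was built with `VLLM_TARGET_DEVICE=cpu` as described earlier, and it reuses `facebook/opt-125m` from the serving example. Whether environment variables set inside the script (rather than exported in the shell beforehand) are picked up depends on when vLLM reads them, so the shell exports shown above remain the safer route.

```python
# Sketch: offline inference on the CPU backend with the tuning variables
# discussed above. Setting them here only helps if it happens before vLLM
# reads them (assumption); exporting them in the shell is more reliable.
import os

os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "40")       # 40 GB KV cache
os.environ.setdefault("VLLM_CPU_OMP_THREADS_BIND", "0-29")  # leave cores 30-31 free

from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")
for output in llm.generate(prompts, sampling_params):
    print(output.prompt, "->", output.outputs[0].text)
```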
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst deleted file mode 100644 index 649de1cd9b53c..0000000000000 --- a/docs/source/getting_started/cpu-installation.rst +++ /dev/null @@ -1,164 +0,0 @@ -.. _installation_cpu: - -Installation with CPU -======================== - -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - -- Tensor Parallel -- Model Quantization (``INT8 W8A8, AWQ``) -- Chunked-prefill -- Prefix-caching -- FP8-E5M2 KV-Caching (TODO) - -Table of contents: - -#. :ref:`Requirements ` -#. :ref:`Quick start using Dockerfile ` -#. :ref:`Build from source ` -#. :ref:`Related runtime environment variables ` -#. :ref:`Intel Extension for PyTorch ` -#. :ref:`Performance tips ` - -.. _cpu_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Compiler: gcc/g++>=12.3.0 (optional, recommended) -* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) - -.. _cpu_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env - -.. _build_cpu_backend_from_source: - -Build from source ------------------ - -- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: - -.. code-block:: console - - $ sudo apt-get update -y - $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev - $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -- Second, install Python packages for vLLM CPU backend building: - -.. code-block:: console - - $ pip install --upgrade pip - $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy - $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - -- Finally, build and install vLLM CPU backend: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE=cpu python setup.py install - -.. note:: - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - - - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. - -.. _env_intro: - -Related runtime environment variables -------------------------------------- - -- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - -- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. 
``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. - -.. _ipex_guidance: - -Intel Extension for PyTorch ---------------------------- - -- `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. - -.. _cpu_backend_performance_tips: - -Performance tips ------------------ - -- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: - -.. code-block:: console - - $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library - $ find / -name *libtcmalloc* # find the dynamic link library path - $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD - $ python examples/offline_inference.py # run vLLM - -- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: - -.. code-block:: console - - $ export VLLM_CPU_KVCACHE_SPACE=40 - $ export VLLM_CPU_OMP_THREADS_BIND=0-29 - $ vllm serve facebook/opt-125m - -- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: - -.. code-block:: console - - $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores - - # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. - CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ - 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 - 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 - 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 - 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 - 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 - 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 - 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 - 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 - 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 - 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 - 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 - 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 - 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 - 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 - 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 - 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 - - # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 - $ export VLLM_CPU_OMP_THREADS_BIND=0-7 - $ python examples/offline_inference.py - -- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. - -CPU Backend Considerations --------------------------- - -- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. - -- Decouple the HTTP serving components from the inference components. 
In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. - -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology `_. For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. - - * Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: - - .. code-block:: console - - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp - - - * Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_. \ No newline at end of file diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md new file mode 100644 index 0000000000000..3b0029f2e88ce --- /dev/null +++ b/docs/source/getting_started/debugging.md @@ -0,0 +1,200 @@ +(debugging)= + +# Debugging Tips + +This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. + +```{note} +Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. +``` + +## Hangs downloading a model + +If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. +It's recommended to download the model first using the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli) and passing the local path to the model to vLLM. This way, you can isolate the issue. + +## Hangs loading a model from disk + +If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. +It'd be better to store the model in a local disk. 
Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. + +```{note} +To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. +``` + +## Model is too large + +If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + +## Enable more logging + +If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: + +- `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging. +- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem. +- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL. +- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. + +## Incorrect network setup + +The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and the IP address should be the correct one. +If it's not, override the IP address using the environment variable `export VLLM_HOST_IP=`. + +You might also need to set `export NCCL_SOCKET_IFNAME=` and `export GLOO_SOCKET_IFNAME=` to specify the network interface for the IP address. + +## Error near `self.graph.replay()` + +If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. +To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. + +## Incorrect hardware/driver + +If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. 
+ +```python +# Test PyTorch NCCL +import torch +import torch.distributed as dist +dist.init_process_group(backend="nccl") +local_rank = dist.get_rank() % torch.cuda.device_count() +torch.cuda.set_device(local_rank) +data = torch.FloatTensor([1,] * 128).to("cuda") +dist.all_reduce(data, op=dist.ReduceOp.SUM) +torch.cuda.synchronize() +value = data.mean().item() +world_size = dist.get_world_size() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("PyTorch NCCL is successful!") + +# Test PyTorch GLOO +gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") +cpu_data = torch.FloatTensor([1,] * 128) +dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) +value = cpu_data.mean().item() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("PyTorch GLOO is successful!") + +if world_size <= 1: + exit() + +# Test vLLM NCCL, with cuda graph +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + +pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) +# pynccl is enabled by default for 0.6.5+, +# but for 0.6.4 and below, we need to enable it manually. +# keep the code for backward compatibility when because people +# prefer to read the latest documentation. +pynccl.disabled = False + +s = torch.cuda.Stream() +with torch.cuda.stream(s): + data.fill_(1) + pynccl.all_reduce(data, stream=s) + value = data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + +print("vLLM NCCL is successful!") + +g = torch.cuda.CUDAGraph() +with torch.cuda.graph(cuda_graph=g, stream=s): + pynccl.all_reduce(data, stream=torch.cuda.current_stream()) + +data.fill_(1) +g.replay() +torch.cuda.current_stream().synchronize() +value = data.mean().item() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("vLLM NCCL with cuda graph is successful!") + +dist.destroy_process_group(gloo_group) +dist.destroy_process_group() +``` + +If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: + +```console +$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py +``` + +If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: + +```console +$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py +``` + +If the script runs successfully, you should see the message `sanity check is successful!`. + +If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. + +```{note} +A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. 
In that case, you can manually assign node rank and specify the IP via command line arguments: + +- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. +- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. + +Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. +``` + +(debugging-python-multiprocessing)= +## Python multiprocessing + +### `RuntimeError` Exception + +If you have seen a warning in your logs like this: + +```console +WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously + initialized. We must use the `spawn` multiprocessing start method. Setting + VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See + https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + for more information. +``` + +or an error from Python that looks like this: + +```console +RuntimeError: + An attempt has been made to start a new process before the + current process has finished its bootstrapping phase. + + This probably means that you are not using fork to start your + child processes and you have forgotten to use the proper idiom + in the main module: + + if __name__ == '__main__': + freeze_support() + ... + + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce an executable. + + To fix this issue, refer to the "Safe importing of main module" + section in https://docs.python.org/3/library/multiprocessing.html +``` + +then you must update your Python code to guard usage of `vllm` behind a `if +__name__ == '__main__':` block. For example, instead of this: + +```python +import vllm + +llm = vllm.LLM(...) +``` + +try this instead: + +```python +if __name__ == '__main__': + import vllm + + llm = vllm.LLM(...) +``` + +## Known Issues + +- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). +- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst deleted file mode 100644 index 7f36d65a227f0..0000000000000 --- a/docs/source/getting_started/debugging.rst +++ /dev/null @@ -1,202 +0,0 @@ -.. _debugging: - -=============== -Debugging Tips -=============== - -This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. - -.. 
note:: - - Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. - -Hangs downloading a model ----------------------------------------- -If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. -It's recommended to download the model first using the `huggingface-cli `_ and passing the local path to the model to vLLM. This way, you can isolate the issue. - -Hangs loading a model from disk ----------------------------------------- -If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. -It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. - -.. note:: - - To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. - -Model is too large ----------------------------------------- -If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. - -Enable more logging ----------------------------------------- -If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: - -- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. -- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem. -- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. -- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs. - -Incorrect network setup ----------------------------------------- -The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one. -If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``. - -You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address. 
- -Error near ``self.graph.replay()`` ----------------------------------------- -If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. -To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. - -Incorrect hardware/driver ----------------------------------------- -If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. - -.. code-block:: python - - # Test PyTorch NCCL - import torch - import torch.distributed as dist - dist.init_process_group(backend="nccl") - local_rank = dist.get_rank() % torch.cuda.device_count() - torch.cuda.set_device(local_rank) - data = torch.FloatTensor([1,] * 128).to("cuda") - dist.all_reduce(data, op=dist.ReduceOp.SUM) - torch.cuda.synchronize() - value = data.mean().item() - world_size = dist.get_world_size() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("PyTorch NCCL is successful!") - - # Test PyTorch GLOO - gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") - cpu_data = torch.FloatTensor([1,] * 128) - dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) - value = cpu_data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("PyTorch GLOO is successful!") - - if world_size <= 1: - exit() - - # Test vLLM NCCL, with cuda graph - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - - pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) - # pynccl is enabled by default for 0.6.5+, - # but for 0.6.4 and below, we need to enable it manually. - # keep the code for backward compatibility when because people - # prefer to read the latest documentation. - pynccl.disabled = False - - s = torch.cuda.Stream() - with torch.cuda.stream(s): - data.fill_(1) - pynccl.all_reduce(data, stream=s) - value = data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("vLLM NCCL is successful!") - - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(cuda_graph=g, stream=s): - pynccl.all_reduce(data, stream=torch.cuda.current_stream()) - - data.fill_(1) - g.replay() - torch.cuda.current_stream().synchronize() - value = data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("vLLM NCCL with cuda graph is successful!") - - dist.destroy_process_group(gloo_group) - dist.destroy_process_group() - -If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: - -.. code-block:: console - - $ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py - -If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: - -.. code-block:: console - - $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py - -If the script runs successfully, you should see the message ``sanity check is successful!``. 
- -If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. - -.. note:: - - A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: - - - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. - - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. - - Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. - -Python multiprocessing ----------------------- - -`RuntimeError` Exception -^^^^^^^^^^^^^^^^^^^^^^^^ - -If you have seen a warning in your logs like this: - -.. code-block:: console - - WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously - initialized. We must use the `spawn` multiprocessing start method. Setting - VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing - for more information. - -or an error from Python that looks like this: - -.. code-block:: console - - RuntimeError: - An attempt has been made to start a new process before the - current process has finished its bootstrapping phase. - - This probably means that you are not using fork to start your - child processes and you have forgotten to use the proper idiom - in the main module: - - if __name__ == '__main__': - freeze_support() - ... - - The "freeze_support()" line can be omitted if the program - is not going to be frozen to produce an executable. - - To fix this issue, refer to the "Safe importing of main module" - section in https://docs.python.org/3/library/multiprocessing.html - -then you must update your Python code to guard usage of ``vllm`` behind a ``if -__name__ == '__main__':`` block. For example, instead of this: - -.. code-block:: python - - import vllm - - llm = vllm.LLM(...) - -try this instead: - -.. code-block:: python - - if __name__ == '__main__': - import vllm - - llm = vllm.LLM(...) - -Known Issues ----------------------------------------- -- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. 
diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md new file mode 100644 index 0000000000000..de7a91c0ffa48 --- /dev/null +++ b/docs/source/getting_started/examples/examples_index.template.md @@ -0,0 +1,8 @@ +# Examples + +```{toctree} +:maxdepth: 1 +:caption: Scripts + +%EXAMPLE_DOCS% +``` \ No newline at end of file diff --git a/docs/source/getting_started/examples/examples_index.template.rst b/docs/source/getting_started/examples/examples_index.template.rst deleted file mode 100644 index 1b34cccbae15a..0000000000000 --- a/docs/source/getting_started/examples/examples_index.template.rst +++ /dev/null @@ -1,8 +0,0 @@ -Examples -================================= - -.. toctree:: - :maxdepth: 1 - :caption: Scripts - - %EXAMPLE_DOCS% diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md new file mode 100644 index 0000000000000..acf42f210dffb --- /dev/null +++ b/docs/source/getting_started/gaudi-installation.md @@ -0,0 +1,386 @@ +# Installation with Intel® Gaudi® AI Accelerators + +This README provides instructions on running vLLM with Intel Gaudi devices. + +## Requirements and Installation + +Please follow the instructions provided in the [Gaudi Installation +Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the [Optimizing Training Platform +Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + +### Requirements + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 + +### Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.hpu -t vllm-hpu-env . +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +```{tip} +If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. +``` + +### Build from source + +#### Environment verification + +To verify that the Intel Gaudi software was correctly installed, run: + +```console +$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +$ pip list | grep neural # verify that neural_compressor is installed +``` + +Refer to [Intel Gaudi Software Stack +Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +for more details. + +#### Run Docker Image + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. 
Refer to the [Intel Gaudi +documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +for more details. + +Use the following commands to run a Docker image: + +```console +$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +``` + +#### Build and Install vLLM + +To build and install vLLM from source, run: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python setup.py develop +``` + +Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: + +```console +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +$ git checkout habana_main +$ python setup.py develop +``` + +## Supported Features + +- [Offline batched inference](#offline-batched-inference) +- Online inference via [OpenAI-Compatible Server](#openai-compatible-server) +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) + for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) + +## Unsupported Features + +- Beam search +- LoRA adapters +- Quantization +- Prefill chunking (mixed-batch inferencing) + +## Supported Configurations + +The following configurations have been validated to be function with +Gaudi2 devices. Configurations that are not listed may or may not work. 
+ +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling + +## Performance Tuning + +### Execution modes + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. + +```{eval-rst} +.. list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode +``` + +```{warning} +In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. +``` + +### Bucketing mechanism + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. 
In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. + +```{note} +Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. +``` + +Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +``` +INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +``` + +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. + +Example (with ramp-up) + +``` +min = 2, step = 32, max = 64 +=> ramp_up = (2, 4, 8, 16) +=> stable = (32, 64) +=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) +``` + +Example (without ramp-up) + +``` +min = 128, step = 128, max = 512 +=> ramp_up = () +=> stable = (128, 256, 384, 512) +=> buckets = ramp_up + stable => (128, 256, 384, 512) +``` + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. 
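To make the `min`/`step`/`max` rules and the ramp-up phase more concrete, here is a small, self-contained Python sketch. It is an illustration of the scheme as described in this section, not vLLM's actual bucketing code; it reproduces the two worked examples above and the logged prompt bucket config.

```python
# Illustrative sketch of the bucketing scheme described above (not the actual
# vLLM implementation): ramp-up values double from `min` until `step` is
# reached, then stable values run from `step` to `max` in increments of `step`,
# and all values are capped at `max`.

def generate_buckets(bmin: int, bstep: int, bmax: int) -> list[int]:
    ramp_up = []
    value = bmin
    while value < bstep:
        ramp_up.append(value)
        value *= 2
    stable = list(range(bstep, bmax + 1, bstep))
    return [b for b in ramp_up + stable if b <= bmax]


def pad_to_bucket(value: int, buckets: list[int]) -> int:
    """Return the smallest bucket that can hold `value` (no padding if it exceeds the max)."""
    candidates = [b for b in buckets if b >= value]
    return min(candidates) if candidates else value


# Reproduce the worked examples from the text.
assert generate_buckets(2, 32, 64) == [2, 4, 8, 16, 32, 64]
assert generate_buckets(128, 128, 512) == [128, 256, 384, 512]

# Batch-size and sequence-length buckets from the logged prompt config:
# bs (min, step, max) = (1, 32, 4), seq = (128, 128, 1024).
bs_buckets = generate_buckets(1, 32, 4)        # -> [1, 2, 4]
seq_buckets = generate_buckets(128, 128, 1024) # -> 8 buckets, 128..1024

# A batch of 3 sequences with max length 412 is padded up to the (4, 512) bucket.
print(pad_to_bucket(3, bs_buckets), pad_to_bucket(412, seq_buckets))
```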
+ +```{warning} +If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. +``` + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. + +```{note} +Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. +``` + +### Warmup + +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: + +``` +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +... +INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +... +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +``` + +This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +```{tip} +Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. +``` + +### HPU Graph capture + +[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. 
When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can consume large amounts of memory, which needs to be taken into account when allocating the KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + +When HPU Graphs are being used, they share the common memory pool ("usable memory") with the KV cache, determined by the `gpu_memory_utilization` flag (`0.9` by default). +Before the KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data to estimate memory usage. +Only after that is the `gpu_memory_utilization` flag applied - at its default value, it marks 90% of the free device memory at that point as usable. +Next, the KV cache is allocated, the model is warmed up, and HPU Graphs are captured. +The environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of usable memory reserved for HPU Graph capture. +With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for the KV cache. +The environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill graphs versus decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), 30% of usable graph memory is reserved for prefill graphs and the remaining 70% for decode graphs. +A lower value corresponds to less usable graph memory reserved for the prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +```{note} +`gpu_memory_utilization` does not correspond to the absolute memory usage across the HPU. It specifies the memory margin after loading the model and performing a profile run. If the device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing the profiling run, `gpu_memory_utilization` at its default value will mark 90% of that 50 GiB as usable, leaving a 5 GiB margin, regardless of total device memory. +``` + +The user can also configure the strategy for capturing HPU Graphs for the prompt and decode stages separately. The strategy affects the order in which graphs are captured. Two strategies are implemented: +- `max_bs` - the graph capture queue is sorted in descending order by batch size; buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1, 256)`). This is the default strategy for decode. +- `min_tokens` - the graph capture queue is sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`). This is the default strategy for prompt. + +When there is a large number of requests pending, the vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, the decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size back to its previous state. This means that in a full-load scenario, the decode batch size is often at its maximum, which makes large-batch-size HPU Graphs crucial to capture, as reflected by the `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the `min_tokens` strategy.
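As a rough back-of-the-envelope sketch of how these knobs compose, and of the two capture-order strategies, consider the following Python snippet. The free-memory figure is hypothetical, and the arithmetic is only an illustration of the ratios described above; the actual accounting is performed by vLLM and reported in the startup logs shown below.

```python
# Back-of-the-envelope sketch of the memory knobs described above,
# using hypothetical numbers (the real accounting happens inside vLLM).
free_after_profile_gib = 50.0   # free HPU memory after weights + profile run (assumed)
gpu_memory_utilization = 0.9    # default
vllm_graph_reserved_mem = 0.1   # default VLLM_GRAPH_RESERVED_MEM
vllm_graph_prompt_ratio = 0.3   # default VLLM_GRAPH_PROMPT_RATIO

usable = free_after_profile_gib * gpu_memory_utilization  # 45.0 GiB "usable memory"
graph_mem = usable * vllm_graph_reserved_mem               # 4.5 GiB for HPU Graph capture
kv_cache = usable - graph_mem                              # 40.5 GiB for KV cache
prompt_graph_mem = graph_mem * vllm_graph_prompt_ratio     # 1.35 GiB for prefill graphs
decode_graph_mem = graph_mem - prompt_graph_mem            # 3.15 GiB for decode graphs

# Capture-order strategies over (batch_size, seq_len) buckets:
buckets = [(64, 128), (64, 256), (32, 128), (32, 256), (1, 128), (1, 256)]
max_bs = sorted(buckets, key=lambda b: (-b[0], b[1]))    # default for decode
min_tokens = sorted(buckets, key=lambda b: b[0] * b[1])  # default for prompt
print(max_bs)      # [(64, 128), (64, 256), (32, 128), (32, 256), (1, 128), (1, 256)]
print(min_tokens)  # [(1, 128), (1, 256), (32, 128), (64, 128), (32, 256), (64, 256)]
```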
+ +```{note} +`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. +``` + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +``` +INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +... 
+INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +... +INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +... +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +``` + +### Recommended vLLM Parameters + +- We recommend running inference on Gaudi 2 with `block_size` of 128 + for BF16 data type. Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see [Gaudi + Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +### Environment variables + +**Diagnostic and profiling knobs:** + +- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. 
+- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. + +**Performance tuning knobs:** + +- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default + +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default + +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default + +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default + +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default + +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - `{phase}` is either `PROMPT` or `DECODE` + + - `{dim}` is either `BS`, `SEQ` or `BLOCK` + + - `{param}` is either `MIN`, `STEP` or `MAX` + + - Default values: + + - Prompt: + : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - Decode: + : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default +- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs + +## Troubleshooting: Tweaking HPU Graphs + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak `gpu_memory_utilization` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default `gpu_memory_utilization` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. +- If this method is not efficient, you can disable `HPUGraph` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. 
You can do that by adding `--enforce-eager` flag to + server (for online inference), or by passing `enforce_eager=True` + argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst deleted file mode 100644 index 249e08278ff8f..0000000000000 --- a/docs/source/getting_started/gaudi-installation.rst +++ /dev/null @@ -1,402 +0,0 @@ -Installation with Intel® Gaudi® AI Accelerators -=============================================== - -This README provides instructions on running vLLM with Intel Gaudi devices. - -Requirements and Installation ------------------------------ - -Please follow the instructions provided in the `Gaudi Installation -Guide `__ -to set up the execution environment. To achieve the best performance, -please follow the methods outlined in the `Optimizing Training Platform -Guide `__. - -Requirements -~~~~~~~~~~~~ - -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 - - -Quick start using Dockerfile -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code:: console - - $ docker build -f Dockerfile.hpu -t vllm-hpu-env . - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env - - -.. tip:: - If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered. - - -Build from source -~~~~~~~~~~~~~~~~~ - -Environment verification -^^^^^^^^^^^^^^^^^^^^^^^^ - -To verify that the Intel Gaudi software was correctly installed, run: - -.. code:: console - - $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible - $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed - $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed - $ pip list | grep neural # verify that neural_compressor is installed - -Refer to `Intel Gaudi Software Stack -Verification `__ -for more details. - -Run Docker Image -^^^^^^^^^^^^^^^^ - -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the `Intel Gaudi -documentation `__ -for more details. - -Use the following commands to run a Docker image: - -.. code:: console - - $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest - -Build and Install vLLM -^^^^^^^^^^^^^^^^^^^^^^ - -To build and install vLLM from source, run: - -.. code:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python setup.py develop - - -Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: - -.. 
code:: console - - $ git clone https://github.com/HabanaAI/vllm-fork.git - $ cd vllm-fork - $ git checkout habana_main - $ python setup.py develop - - -Supported Features ------------------- - -- `Offline batched - inference `__ -- Online inference via `OpenAI-Compatible - Server `__ -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with `HPU Graphs `__ - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) - -Unsupported Features --------------------- - -- Beam search -- LoRA adapters -- Quantization -- Prefill chunking (mixed-batch inferencing) - -Supported Configurations ------------------------- - -The following configurations have been validated to be function with -Gaudi2 devices. Configurations that are not listed may or may not work. - -- `meta-llama/Llama-2-7b `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Llama-2-7b-chat-hf `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-8B `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-8B-Instruct `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-8B `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Llama-2-70b `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Llama-2-70b-chat-hf `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-70B `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-70B-Instruct `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-70B `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - -Performance Tuning ------------------- - -Execution modes -~~~~~~~~~~~~~~~ - -Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. - -.. list-table:: vLLM execution modes - :widths: 25 25 50 - :header-rows: 1 - - * - ``PT_HPU_LAZY_MODE`` - - ``enforce_eager`` - - execution mode - * - 0 - - 0 - - torch.compile - * - 0 - - 1 - - PyTorch eager mode - * - 1 - - 0 - - HPU Graphs - * - 1 - - 1 - - PyTorch lazy mode - -.. warning:: - In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. 
For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. - - -Bucketing mechanism -~~~~~~~~~~~~~~~~~~~ - -Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. -In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. - -.. note:: - Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. - -Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: - -.. code-block:: - - INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - -``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. - -Example (with ramp-up) - -.. code-block:: - - min = 2, step = 32, max = 64 - => ramp_up = (2, 4, 8, 16) - => stable = (32, 64) - => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) - -Example (without ramp-up) - -.. 
code-block:: - - min = 128, step = 128, max = 512 - => ramp_up = () - => stable = (128, 256, 384, 512) - => buckets = ramp_up + stable => (128, 256, 384, 512) - - -In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. - -.. warning:: - If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. - -As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. - -.. note:: - Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. - -Warmup -~~~~~~ - -Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: - -.. code-block:: - - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB - INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB - ... - INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB - INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB - ... - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - -This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. - -.. 
tip:: - Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. - -HPU Graph capture -~~~~~~~~~~~~~~~~~ - -`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. - - -When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). -Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. -Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. -Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints. -Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. - -.. note:: - ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: -- ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode -- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt - -When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. 
When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy. - - -.. note:: - ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. - - -Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): - -.. code-block:: - - INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache - INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 - 
INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB - ... - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - ... - INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB - INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB - ... - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory - INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) - - -Recommended vLLM Parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- We recommend running inference on Gaudi 2 with ``block_size`` of 128 - for BF16 data type. Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see `Gaudi - Architecture `__). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. - -Environment variables -~~~~~~~~~~~~~~~~~~~~~ - -**Diagnostic and profiling knobs:** - -- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. 
-- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. -- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. -- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. -- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. - -**Performance tuning knobs:** - -- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default -- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default -- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default -- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default -- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default -- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism - - - ``{phase}`` is either ``PROMPT`` or ``DECODE`` - - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` - - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` - - Default values: - - - Prompt: - - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` - - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` - - - Decode: - - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` - - -Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - -- ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default -- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs - -Troubleshooting: Tweaking HPU Graphs ------------------------------------- - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak ``gpu_memory_utilization`` knob. It will decrease the - allocation of KV cache, leaving some headroom for capturing graphs - with larger batch size. By default ``gpu_memory_utilization`` is set - to 0.9. It attempts to allocate ~90% of HBM left for KV cache after - short profiling run. 
Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. - -- If this method is not efficient, you can disable ``HPUGraph`` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding ``--enforce-eager`` flag to - server (for online inference), or by passing ``enforce_eager=True`` - argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md new file mode 100644 index 0000000000000..996fb346f43d4 --- /dev/null +++ b/docs/source/getting_started/installation.md @@ -0,0 +1,199 @@ +(installation)= + +# Installation + +vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) + +## Install released versions + +You can install vLLM using pip: + +```console +$ # (Recommended) Create a new conda environment. +$ conda create -n myenv python=3.12 -y +$ conda activate myenv + +$ # Install vLLM with CUDA 12.1. +$ pip install vllm +``` + +```{note} +Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See for more details. +``` + +````{note} +As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. +We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: + +```console +$ # Install vLLM with CUDA 11.8. +$ export VLLM_VERSION=0.6.1.post1 +$ export PYTHON_VERSION=310 +$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. + +Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. +```` + +(install-the-latest-code)= + +## Install the latest code + +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. 
You can download and install it with the following command: + +```console +$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +``` + +If you want to access the wheels for previous commits, you can specify the commit hash in the URL: + +```console +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +``` + +Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + +Another way to access the latest code is to use the docker images: + +```console +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +``` + +These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. + +(build-from-source)= + +## Build from source + +(python-only-build)= + +### Python-only build (without compilation) + +If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ VLLM_USE_PRECOMPILED=1 pip install --editable . +``` + +This will download the latest nightly wheel and use the compiled libraries from there in the install. + +The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): + +```console +$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +$ pip install --editable . +``` + +You can find more information about vLLM's wheels [above](#install-the-latest-code). + +```{note} +There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. +It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel. +``` + +### Full build (with compilation) + +If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -e . +``` + +```{tip} +Building from source requires a lot of compilation. 
If you are building from source repeatedly, it's more efficient to cache the compilation results. + +For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . +As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + +[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. +The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. +``` + +#### Use an existing PyTorch installation + +There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: + +- Building vLLM with PyTorch nightly or a custom PyTorch build. +- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124` to [install PyTorch nightly](https://pytorch.org/get-started/locally/), and then build vLLM on top of it. + +To build vLLM using an existing PyTorch installation: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python use_existing_torch.py +$ pip install -r requirements-build.txt +$ pip install -e . --no-build-isolation +``` + +#### Use the local cutlass for compilation + +Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. +To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +``` + +#### Troubleshooting + +To avoid your system being overloaded, you can limit the number of compilation jobs +to be run simultaneously, via the environment variable `MAX_JOBS`. For example: + +```console +$ export MAX_JOBS=6 +$ pip install -e . +``` + +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. + +Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. + +```console +$ # Use `--ipc=host` to make sure the shared memory is large enough. +$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +``` + +If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). 
After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: + +```console +$ export CUDA_HOME=/usr/local/cuda +$ export PATH="${CUDA_HOME}/bin:$PATH" +``` + +Here is a sanity check to verify that the CUDA Toolkit is correctly installed: + +```console +$ nvcc --version # verify that nvcc is in your PATH +$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +``` + +### Unsupported OS build + +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. + +Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: + +```console +$ export VLLM_TARGET_DEVICE=empty +$ pip install -e . +``` diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst deleted file mode 100644 index 9b6cb0e80d60e..0000000000000 --- a/docs/source/getting_started/installation.rst +++ /dev/null @@ -1,214 +0,0 @@ -.. _installation: - -============ -Installation -============ - -vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. - -Requirements -============ - -* OS: Linux -* Python: 3.9 -- 3.12 -* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) - -Install released versions -========================= - -You can install vLLM using pip: - -.. code-block:: console - - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.12 -y - $ conda activate myenv - - $ # Install vLLM with CUDA 12.1. - $ pip install vllm - -.. note:: - - Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue `_ for more details. - -.. note:: - - As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. - We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: - - .. code-block:: console - - $ # Install vLLM with CUDA 11.8. - $ export VLLM_VERSION=0.6.1.post1 - $ export PYTHON_VERSION=310 - $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 - - In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. - - Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. - - -.. _install-the-latest-code: - -Install the latest code -======================= - -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. 
To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command: - -.. code-block:: console - - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -If you want to access the wheels for previous commits, you can specify the commit hash in the URL: - -.. code-block:: console - - $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. - -Another way to access the latest code is to use the docker images: - -.. code-block:: console - - $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} - -These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. - -The latest code can contain bugs and may not be stable. Please use it with caution. - -.. _build_from_source: - -Build from source -================= - -.. _python-only-build: - -Python-only build (without compilation) ---------------------------------------- - -If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag `_, changes you make to the code will be reflected when you run vLLM: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ VLLM_USE_PRECOMPILED=1 pip install --editable . - -This will download the latest nightly wheel and use the compiled libraries from there in the install. - -The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel `_: - -.. code-block:: console - - $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl - $ pip install --editable . - -You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. - -.. note:: - - There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. - It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel. - -Full build (with compilation) ------------------------------ - -If you want to modify C++ or CUDA code, you'll need to build vLLM from source. 
This can take several minutes: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ pip install -e . - -.. tip:: - - Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . - As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. - - `sccache `_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments. - The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``. - - -Use an existing PyTorch installation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: - -* Building vLLM with PyTorch nightly or a custom PyTorch build. -* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly `_, and then build vLLM on top of it. - -To build vLLM using an existing PyTorch installation: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python use_existing_torch.py - $ pip install -r requirements-build.txt - $ pip install -e . --no-build-isolation - - -Use the local cutlass for compilation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. -To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . - - -Troubleshooting -~~~~~~~~~~~~~~~ - -To avoid your system being overloaded, you can limit the number of compilation jobs -to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: - -.. code-block:: console - - $ export MAX_JOBS=6 - $ pip install -e . - -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. - -Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. - -.. code-block:: console - - $ # Use `--ipc=host` to make sure the shared memory is large enough. - $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 - -If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website `_. 
After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.: - -.. code-block:: console - - $ export CUDA_HOME=/usr/local/cuda - $ export PATH="${CUDA_HOME}/bin:$PATH" - -Here is a sanity check to verify that the CUDA Toolkit is correctly installed: - -.. code-block:: console - - $ nvcc --version # verify that nvcc is in your PATH - $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME - - -Unsupported OS build --------------------- - -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. - -Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: - -.. code-block:: console - - $ export VLLM_TARGET_DEVICE=empty - $ pip install -e . diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md new file mode 100644 index 0000000000000..d6de5760cc82c --- /dev/null +++ b/docs/source/getting_started/neuron-installation.md @@ -0,0 +1,132 @@ +(installation-neuron)= + +# Installation with Neuron + +vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. +Paged Attention and Chunked Prefill are currently in development and will be available soon. +Data types currently supported in Neuron SDK are FP16 and BF16. + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.11 +- Accelerator: NeuronCore_v2 (in trn1/inf2 instances) +- Pytorch 2.0.1/2.1.1 +- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) + +Installation steps: + +- [Build from source](#build-from-source-neuron) + + - [Step 0. Launch Trn1/Inf2 instances](#launch-instances) + - [Step 1. Install drivers and tools](#install-drivers) + - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx) + - [Step 3. Install vLLM from source](#install-vllm) + +(build-from-source-neuron)= + +```{note} +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +``` + +## Build from source + +Following instructions are applicable to Neuron SDK 2.16 and beyond. + +(launch-instances)= + +### Step 0. Launch Trn1/Inf2 instances + +Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). + +- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. +- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/) +- Select Ubuntu Server 22.04 TLS AMI +- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. 
+- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance + +(install-drivers)= + +### Step 1. Install drivers and tools + +The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: + +```console +# Configure Linux for Neuron repository updates +. /etc/os-release +sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <` - - - :ref:`Step 0. Launch Trn1/Inf2 instances ` - - :ref:`Step 1. Install drivers and tools ` - - :ref:`Step 2. Install transformers-neuronx and its dependencies ` - - :ref:`Step 3. Install vLLM from source ` - -.. _build_from_source_neuron: - -.. note:: - - The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. - -Build from source ------------------ - -Following instructions are applicable to Neuron SDK 2.16 and beyond. - -.. _launch_instances: - -Step 0. Launch Trn1/Inf2 instances -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS `_. - -- Please follow the instructions at `launch an Amazon EC2 Instance `_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. -- To get more information about instances sizes and pricing see: `Trn1 web page `_, `Inf2 web page `_ -- Select Ubuntu Server 22.04 TLS AMI -- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. -- After launching the instance, follow the instructions in `Connect to your instance `_ to connect to the instance - -.. _install_drivers: - -Step 1. Install drivers and tools -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron `_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: - -.. code-block:: console - - # Configure Linux for Neuron repository updates - . /etc/os-release - sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`_ will be the backend to support inference on trn1/inf2 instances. -Follow the steps below to install transformer-neuronx package and its dependencies. - -.. 
code-block:: console - - # Install Python venv - sudo apt-get install -y python3.10-venv g++ - - # Create Python venv - python3.10 -m venv aws_neuron_venv_pytorch - - # Activate Python venv - source aws_neuron_venv_pytorch/bin/activate - - # Install Jupyter notebook kernel - pip install ipykernel - python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" - pip install jupyter notebook - pip install environment_kernels - - # Set pip repository pointing to the Neuron repository - python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com - - # Install wget, awscli - python -m pip install wget - python -m pip install awscli - - # Update Neuron Compiler and Framework - python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx - -.. _install_vllm: - -Step 3. Install vLLM from source -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ pip install -U -r requirements-neuron.txt - $ VLLM_TARGET_DEVICE="neuron" pip install . - -If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/openvino-installation.md new file mode 100644 index 0000000000000..8b43c0a90447f --- /dev/null +++ b/docs/source/getting_started/openvino-installation.md @@ -0,0 +1,104 @@ +(installation-openvino)= + +# Installation with OpenVINO + +vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (`--enable-prefix-caching`) +- Chunked prefill (`--enable-chunked-prefill`) + +**Table of contents**: + +- [Requirements](#openvino-backend-requirements) +- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile) +- [Build from source](#install-openvino-backend-from-source) +- [Performance tips](#openvino-backend-performance-tips) +- [Limitations](#openvino-backend-limitations) + +(openvino-backend-requirements)= + +## Requirements + +- OS: Linux +- Instruction set architecture (ISA) requirement: at least AVX2. + +(openvino-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.openvino -t vllm-openvino-env . +$ docker run -it --rm vllm-openvino-env +``` + +(install-openvino-backend-from-source)= + +## Install from source + +- First, install Python. For example, on Ubuntu 22.04, you can run: + + ```console + $ sudo apt-get update -y + $ sudo apt-get install python3 + ``` + +- Second, install prerequisites vLLM OpenVINO backend installation: + + ```console + $ pip install --upgrade pip + $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +- Finally, install vLLM with OpenVINO backend: + + ```console + $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . 
+ ``` + +- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). + +(openvino-backend-performance-tips)= + +## Performance tips + +### vLLM OpenVINO backend environment variables + +- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default. +- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` + +### CPU performance tips + +CPU uses the following environment variables to control behavior: + +- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. +- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. + +To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`) + +OpenVINO best known configuration for CPU is: + +```console +$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 +``` + +### GPU performance tips + +GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache). + +Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. + +OpenVINO best known configuration for GPU is: + +```console +$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +(openvino-backend-limitations)= + +## Limitations + +- LoRA serving is not supported. +- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. +- Tensor and pipeline parallelism are not currently enabled in vLLM integration. 
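+
+As a worked example of the knobs described above, the following minimal sketch applies the best known CPU configuration from Python before constructing the engine for offline inference. This is only an illustration: the environment variable values mirror the CPU configuration shown earlier, the model name is the same one used in the benchmark commands above, and it assumes the `--enable-chunked-prefill` and `--max-num-batched-tokens` flags are also accepted as the keyword arguments `enable_chunked_prefill` and `max_num_batched_tokens`.
+
+```python
+import os
+
+# Mirror the best known CPU configuration from the performance tips above.
+os.environ["VLLM_OPENVINO_KVCACHE_SPACE"] = "100"            # 100 GB reserved for KV cache
+os.environ["VLLM_OPENVINO_CPU_KV_CACHE_PRECISION"] = "u8"    # U8 KV cache precision
+os.environ["VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS"] = "ON"  # compress weights during model loading
+
+# Import vLLM only after the environment is configured.
+from vllm import LLM, SamplingParams
+
+llm = LLM(
+    model="meta-llama/Llama-2-7b-chat-hf",  # illustrative model choice
+    enable_chunked_prefill=True,            # assumed equivalent of --enable-chunked-prefill
+    max_num_batched_tokens=256,             # recommended value from the CPU tips above
+)
+
+outputs = llm.generate(["What is OpenVINO?"], SamplingParams(max_tokens=64))
+print(outputs[0].outputs[0].text)
+```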
diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst deleted file mode 100644 index 5eeb7c78f7e51..0000000000000 --- a/docs/source/getting_started/openvino-installation.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. _installation_openvino: - -Installation with OpenVINO -========================== - -vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs `_). OpenVINO vLLM backend supports the following advanced vLLM features: - -- Prefix caching (``--enable-prefix-caching``) -- Chunked prefill (``--enable-chunked-prefill``) - -**Table of contents**: - -- :ref:`Requirements ` -- :ref:`Quick start using Dockerfile ` -- :ref:`Build from source ` -- :ref:`Performance tips ` -- :ref:`Limitations ` - -.. _openvino_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Instruction set architecture (ISA) requirement: at least AVX2. - -.. _openvino_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.openvino -t vllm-openvino-env . - $ docker run -it --rm vllm-openvino-env - -.. _install_openvino_backend_from_source: - -Install from source -------------------- - -- First, install Python. For example, on Ubuntu 22.04, you can run: - - .. code-block:: console - - $ sudo apt-get update -y - $ sudo apt-get install python3 - -- Second, install prerequisites vLLM OpenVINO backend installation: - - .. code-block:: console - - $ pip install --upgrade pip - $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - -- Finally, install vLLM with OpenVINO backend: - - .. code-block:: console - - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . - -- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html `_. - -.. _openvino_backend_performance_tips: - -Performance tips ----------------- - -vLLM OpenVINO backend environment variables -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- ``VLLM_OPENVINO_DEVICE`` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default. - -- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` - -CPU performance tips -~~~~~~~~~~~~~~~~~~~~ - -CPU uses the following environment variables to control behavior: - -- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - -- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. 
By default, FP16 / BF16 is used depending on platform. - -To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``) - -OpenVINO best known configuration for CPU is: - -.. code-block:: console - - $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 - -GPU performance tips -~~~~~~~~~~~~~~~~~~~~ -GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache). - -Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. - -OpenVINO best known configuration for GPU is: - -.. code-block:: console - - $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json - -.. _openvino_backend_limitations: - -Limitations ------------ - -- LoRA serving is not supported. - -- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. - -- Tensor and pipeline parallelism are not currently enabled in vLLM integration. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md new file mode 100644 index 0000000000000..165e5df146dcd --- /dev/null +++ b/docs/source/getting_started/quickstart.md @@ -0,0 +1,175 @@ +(quickstart)= + +# Quickstart + +This guide will help you quickly get started with vLLM to: + +- [Run offline batched inference](#offline-batched-inference) +- [Run OpenAI-compatible inference](#openai-compatible-server) + +## Prerequisites + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) + +## Installation + +You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. + +```console +$ conda create -n myenv python=3.10 -y +$ conda activate myenv +$ pip install vllm +``` + +Please refer to the {ref}`installation documentation ` for more details on installing vLLM. + +(offline-batched-inference)= + +## Offline Batched Inference + +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: + +The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: + +- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine. +- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process. 
+ +```python +from vllm import LLM, SamplingParams +``` + +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html). + +```python +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +``` + +The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models). + +```python +llm = LLM(model="facebook/opt-125m") +``` + +```{note} +By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. +``` + +Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. + +```python +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +(openai-compatible-server)= + +## OpenAI-Compatible Server + +vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. +By default, it starts the server at `http://localhost:8000`. You can specify the address with `--host` and `--port` arguments. The server currently hosts one model at a time and implements endpoints such as [list models](https://platform.openai.com/docs/api-reference/models/list), [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create), and [create completion](https://platform.openai.com/docs/api-reference/completions/create) endpoints. + +Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: + +```console +$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +``` + +```{note} +By default, the server uses a predefined chat template stored in the tokenizer. +You can learn about overriding it [here](#chat-template). +``` + +This server can be queried in the same format as OpenAI API. For example, to list the models: + +```console +$ curl http://localhost:8000/v1/models +``` + +You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. 
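+
+For example, here is a minimal sketch with a placeholder key (`your-secret-key` is an illustrative value, not a real credential); the server then only accepts requests that present the same key as a bearer token:
+
+```console
+$ vllm serve Qwen/Qwen2.5-1.5B-Instruct --api-key your-secret-key
+$ curl http://localhost:8000/v1/models \
+$     -H "Authorization: Bearer your-secret-key"
+```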
+ +### OpenAI Completions API with vLLM + +Once your server is started, you can query the model with input prompts: + +```console +$ curl http://localhost:8000/v1/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "prompt": "San Francisco is a", +$ "max_tokens": 7, +$ "temperature": 0 +$ }' +``` + +Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package: + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) +completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a") +print("Completion result:", completion) +``` + +A more detailed client example can be found here: + +### OpenAI Chat Completions API with vLLM + +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. + +You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: + +```console +$ curl http://localhost:8000/v1/chat/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "messages": [ +$ {"role": "system", "content": "You are a helpful assistant."}, +$ {"role": "user", "content": "Who won the world series in 2020?"} +$ ] +$ }' +``` + +Alternatively, you can use the `openai` python package: + +```python +from openai import OpenAI +# Set OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +chat_response = client.chat.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Tell me a joke."}, + ] +) +print("Chat response:", chat_response) +``` diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst deleted file mode 100644 index 0c0491c860563..0000000000000 --- a/docs/source/getting_started/quickstart.rst +++ /dev/null @@ -1,181 +0,0 @@ -.. _quickstart: - -========== -Quickstart -========== - -This guide will help you quickly get started with vLLM to: - -* :ref:`Run offline batched inference ` -* :ref:`Run OpenAI-compatible inference ` - -Prerequisites --------------- -- OS: Linux -- Python: 3.9 -- 3.12 -- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) - -Installation --------------- - -You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments. - -.. code-block:: console - - $ conda create -n myenv python=3.10 -y - $ conda activate myenv - $ pip install vllm - -Please refer to the :ref:`installation documentation ` for more details on installing vLLM. - -.. 
_offline_batched_inference: - -Offline Batched Inference -------------------------- - -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__. - -The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`: - -- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine. -- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process. - -.. code-block:: python - - from vllm import LLM, SamplingParams - -The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__. - -.. code-block:: python - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `. - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - -.. note:: - - By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine. - -Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens. - -.. code-block:: python - - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -.. _openai_compatible_server: - -OpenAI-Compatible Server ------------------------- - -vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. -By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints. - -Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model: - -.. code-block:: console - - $ vllm serve Qwen/Qwen2.5-1.5B-Instruct - -.. note:: - - By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__. - -This server can be queried in the same format as OpenAI API. For example, to list the models: - -.. code-block:: console - - $ curl http://localhost:8000/v1/models - -You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. - -OpenAI Completions API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once your server is started, you can query the model with input prompts: - -.. 
code-block:: console - - $ curl http://localhost:8000/v1/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "Qwen/Qwen2.5-1.5B-Instruct", - $ "prompt": "San Francisco is a", - $ "max_tokens": 7, - $ "temperature": 0 - $ }' - -Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: - -.. code-block:: python - - from openai import OpenAI - - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") - print("Completion result:", completion) - -A more detailed client example can be found `here `__. - -OpenAI Chat Completions API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. - -You can use the `create chat completion `_ endpoint to interact with the model: - -.. code-block:: console - - $ curl http://localhost:8000/v1/chat/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "Qwen/Qwen2.5-1.5B-Instruct", - $ "messages": [ - $ {"role": "system", "content": "You are a helpful assistant."}, - $ {"role": "user", "content": "Who won the world series in 2020?"} - $ ] - $ }' - -Alternatively, you can use the ``openai`` python package: - -.. code-block:: python - - from openai import OpenAI - # Set OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - chat_response = client.chat.completions.create( - model="Qwen/Qwen2.5-1.5B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Tell me a joke."}, - ] - ) - print("Chat response:", chat_response) diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md new file mode 100644 index 0000000000000..f2a949e7247d8 --- /dev/null +++ b/docs/source/getting_started/tpu-installation.md @@ -0,0 +1,192 @@ +(installation-tpu)= + +# Installation with TPU + +Tensor Processing Units (TPUs) are Google's custom-developed application-specific +integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs +are available in different versions each with different hardware specifications. +For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm). +For more information on the TPU versions supported with vLLM, see: + +- [TPU v6e](https://cloud.google.com/tpu/docs/v6e) +- [TPU v5e](https://cloud.google.com/tpu/docs/v5e) +- [TPU v5p](https://cloud.google.com/tpu/docs/v5p) +- [TPU v4](https://cloud.google.com/tpu/docs/v4) + +These TPU versions allow you to configure the physical arrangements of the TPU +chips. This can improve throughput and networking performance. 
For more +information see: + +- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations) +- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config) +- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config) +- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config) + +In order for you to use Cloud TPUs you need to have TPU quota granted to your +Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a +GPC project and are specified in terms of TPU version, the number of TPU you +want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota). + +For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing). + +You may need additional persistent storage for your TPU VMs. For more +information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options). + +## Requirements + +- Google Cloud TPU VM +- TPU versions: v6e, v5e, v5p, v4 +- Python: 3.10 or newer + +### Provision Cloud TPUs + +You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest) +or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources) +API. This section shows how to create TPUs using the queued resource API. For +more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api). +Queued resources enable you to request Cloud TPU resources in a queued manner. +When you request queued resources, the request is added to a queue maintained by +the Cloud TPU service. When the requested resource becomes available, it's +assigned to your Google Cloud project for your immediate exclusive use. + +```{note} +In all of the following commands, replace the ALL CAPS parameter names with +appropriate values. See the parameter descriptions table for more information. +``` + +## Provision a Cloud TPU with the queued resource API + +Create a TPU v5e with 4 TPU chips: + +```console +gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ +--node-id TPU_NAME \ +--project PROJECT_ID \ +--zone ZONE \ +--accelerator-type ACCELERATOR_TYPE \ +--runtime-version RUNTIME_VERSION \ +--service-account SERVICE_ACCOUNT +``` + +```{eval-rst} +.. list-table:: Parameter descriptions + :header-rows: 1 + + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. 
For example: + `tpu-service-account@.iam.gserviceaccount.com` +``` + +Connect to your TPU using SSH: + +```bash +gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE +``` + +Install Miniconda + +```bash +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh +source ~/.bashrc +``` + +Create and activate a Conda environment for vLLM: + +```bash +conda create -n vllm python=3.10 -y +conda activate vllm +``` + +Clone the vLLM repository and go to the vLLM directory: + +```bash +git clone https://github.com/vllm-project/vllm.git && cd vllm +``` + +Uninstall the existing `torch` and `torch_xla` packages: + +```bash +pip uninstall torch torch-xla -y +``` + +Install build dependencies: + +```bash +pip install -r requirements-tpu.txt +sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +``` + +Run the setup script: + +```bash +VLLM_TARGET_DEVICE="tpu" python setup.py develop +``` + +## Provision Cloud TPUs with GKE + +For more information about using TPUs with GKE, see + + + + +(build-docker-tpu)= + +## Build a docker image with {code}`Dockerfile.tpu` + +You can use to build a Docker image with TPU support. + +```console +$ docker build -f Dockerfile.tpu -t vllm-tpu . +``` + +Run the Docker image with the following command: + +```console +$ # Make sure to add `--privileged --net host --shm-size=16G`. +$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +``` + +```{note} +Since TPU relies on XLA which requires static shapes, vLLM bucketizes the +possible input shapes and compiles an XLA graph for each shape. The +compilation time may take 20~30 minutes in the first run. However, the +compilation time reduces to ~5 minutes afterwards because the XLA graphs are +cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). +``` + +````{tip} +If you encounter the following error: + +```console +from torch._C import * # noqa: F403 +ImportError: libopenblas.so.0: cannot open shared object file: No such +file or directory +``` + +Install OpenBLAS with the following command: + +```console +$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +``` +```` diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst deleted file mode 100644 index 22cc684a1c778..0000000000000 --- a/docs/source/getting_started/tpu-installation.rst +++ /dev/null @@ -1,200 +0,0 @@ -.. _installation_tpu: - -##################### -Installation with TPU -##################### - -Tensor Processing Units (TPUs) are Google's custom-developed application-specific -integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs -are available in different versions each with different hardware specifications. -For more information about TPUs, see `TPU System Architecture `_. -For more information on the TPU versions supported with vLLM, see: - -* `TPU v6e `_ -* `TPU v5e `_ -* `TPU v5p `_ -* `TPU v4 `_ - -These TPU versions allow you to configure the physical arrangements of the TPU -chips. This can improve throughput and networking performance. For more -information see: - -* `TPU v6e topologies `_ -* `TPU v5e topologies `_ -* `TPU v5p topologies `_ -* `TPU v4 topologies `_ - -In order for you to use Cloud TPUs you need to have TPU quota granted to your -Google Cloud Platform project. 
TPU quotas specify how many TPUs you can use in a -GPC project and are specified in terms of TPU version, the number of TPU you -want to use, and quota type. For more information, see `TPU quota `_. - -For TPU pricing information, see `Cloud TPU pricing `_. - -You may need additional persistent storage for your TPU VMs. For more -information, see `Storage options for Cloud TPU data `_. - -Requirements ------------- - -* Google Cloud TPU VM -* TPU versions: v6e, v5e, v5p, v4 -* Python: 3.10 or newer - -Provision Cloud TPUs -==================== - -You can provision Cloud TPUs using the `Cloud TPU API `_ -or the `queued resources `_ -API. This section shows how to create TPUs using the queued resource API. For -more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. -Queued resources enable you to request Cloud TPU resources in a queued manner. -When you request queued resources, the request is added to a queue maintained by -the Cloud TPU service. When the requested resource becomes available, it's -assigned to your Google Cloud project for your immediate exclusive use. - -.. note:: - In all of the following commands, replace the ALL CAPS parameter names with - appropriate values. See the parameter descriptions table for more information. - -Provision a Cloud TPU with the queued resource API --------------------------------------------------- -Create a TPU v5e with 4 TPU chips: - -.. code-block:: console - - gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ - --node-id TPU_NAME \ - --project PROJECT_ID \ - --zone ZONE \ - --accelerator-type ACCELERATOR_TYPE \ - --runtime-version RUNTIME_VERSION \ - --service-account SERVICE_ACCOUNT - - -.. list-table:: Parameter descriptions - :header-rows: 1 - - * - Parameter name - - Description - * - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. - * - TPU_NAME - - The user-assigned name of the TPU which is created when the queued - resource request is allocated. - * - PROJECT_ID - - Your Google Cloud project - * - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use - depends on the version of TPUs you are using. For more information, see - `TPU regions and zones `_ - * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, for example - `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, - see `TPU versions `_. - * - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. - * - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM - Cloud Console under *Service Accounts*. For example: - `tpu-service-account@.iam.gserviceaccount.com` - -Connect to your TPU using SSH: - -.. code-block:: bash - - gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE - -Install Miniconda - -.. code-block:: bash - - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh - source ~/.bashrc - -Create and activate a Conda environment for vLLM: - -.. code-block:: bash - - conda create -n vllm python=3.10 -y - conda activate vllm - -Clone the vLLM repository and go to the vLLM directory: - -.. code-block:: bash - - git clone https://github.com/vllm-project/vllm.git && cd vllm - -Uninstall the existing `torch` and `torch_xla` packages: - -.. code-block:: bash - - pip uninstall torch torch-xla -y - -Install build dependencies: - -.. 
code-block:: bash - - pip install -r requirements-tpu.txt - sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev - -Run the setup script: - -.. code-block:: bash - - VLLM_TARGET_DEVICE="tpu" python setup.py develop - - -Provision Cloud TPUs with GKE ------------------------------ - -For more information about using TPUs with GKE, see -https://cloud.google.com/kubernetes-engine/docs/how-to/tpus -https://cloud.google.com/kubernetes-engine/docs/concepts/tpus -https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus - -.. _build_docker_tpu: - -Build a docker image with :code:`Dockerfile.tpu` ------------------------------------------------- - -You can use `Dockerfile.tpu `_ -to build a Docker image with TPU support. - -.. code-block:: console - - $ docker build -f Dockerfile.tpu -t vllm-tpu . - -Run the Docker image with the following command: - -.. code-block:: console - - $ # Make sure to add `--privileged --net host --shm-size=16G`. - $ docker run --privileged --net host --shm-size=16G -it vllm-tpu - -.. note:: - - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the - possible input shapes and compiles an XLA graph for each shape. The - compilation time may take 20~30 minutes in the first run. However, the - compilation time reduces to ~5 minutes afterwards because the XLA graphs are - cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). - -.. tip:: - - If you encounter the following error: - - .. code-block:: console - - from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such - file or directory - - - Install OpenBLAS with the following command: - - .. code-block:: console - - $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev - diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/xpu-installation.md new file mode 100644 index 0000000000000..9554ae4b7fb44 --- /dev/null +++ b/docs/source/getting_started/xpu-installation.md @@ -0,0 +1,74 @@ +(installation-xpu)= + +# Installation with XPU + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +1. [Requirements](#xpu-backend-requirements) +2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile) +3. [Build from source](#build-xpu-backend-from-source) + +(xpu-backend-requirements)= + +## Requirements + +- OS: Linux +- Supported Hardware: Intel Data Center GPU, Intel ARC GPU +- OneAPI requirements: oneAPI 2024.2 + +(xpu-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env +``` + +(build-xpu-backend-from-source)= + +## Build from source + +- First, install required driver and intel OneAPI 2024.2 or later. +- Second, install Python packages for vLLM XPU backend building: + +```console +$ source /opt/intel/oneapi/setvars.sh +$ pip install --upgrade pip +$ pip install -v -r requirements-xpu.txt +``` + +- Finally, build and install vLLM XPU backend: + +```console +$ VLLM_TARGET_DEVICE=xpu python setup.py install +``` + +```{note} +- FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. 
+``` + +## Distributed inference and serving + +XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: + +```console +$ python -m vllm.entrypoints.openai.api_server \ +$ --model=facebook/opt-13b \ +$ --dtype=bfloat16 \ +$ --device=xpu \ +$ --max_model_len=1024 \ +$ --distributed-executor-backend=ray \ +$ --pipeline-parallel-size=2 \ +$ -tp=8 +``` + +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst deleted file mode 100644 index b1868acbc84b0..0000000000000 --- a/docs/source/getting_started/xpu-installation.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. _installation_xpu: - -Installation with XPU -======================== - -vLLM initially supports basic model inferencing and serving on Intel GPU platform. - -Table of contents: - -#. :ref:`Requirements ` -#. :ref:`Quick start using Dockerfile ` -#. :ref:`Build from source ` - -.. _xpu_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Supported Hardware: Intel Data Center GPU, Intel ARC GPU -* OneAPI requirements: oneAPI 2024.2 - -.. _xpu_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - vllm-xpu-env - -.. _build_xpu_backend_from_source: - -Build from source ------------------ - -- First, install required driver and intel OneAPI 2024.2 or later. - -- Second, install Python packages for vLLM XPU backend building: - -.. code-block:: console - - $ source /opt/intel/oneapi/setvars.sh - $ pip install --upgrade pip - $ pip install -v -r requirements-xpu.txt - -- Finally, build and install vLLM XPU backend: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE=xpu python setup.py install - -.. note:: - - FP16 is the default data type in the current XPU backend. The BF16 data - type will be supported in the future. - - -Distributed inference and serving ---------------------------------- - -XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: - -.. code-block:: console - - $ python -m vllm.entrypoints.openai.api_server \ - $ --model=facebook/opt-13b \ - $ --dtype=bfloat16 \ - $ --device=xpu \ - $ --max_model_len=1024 \ - $ --distributed-executor-backend=ray \ - $ --pipeline-parallel-size=2 \ - $ -tp=8 - -By default, a ray instance will be launched automatically if no existing one is detected in system, with ``num-gpus`` equals to ``parallel_config.world_size``. We recommend properly starting a ray cluster before execution, referring helper `script `_. diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000000000..34f9c4caebe6f --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,200 @@ +# Welcome to vLLM! 
+ +```{figure} ./assets/logos/vllm-logo-text-light.png +:align: center +:alt: vLLM +:class: no-scaled-link +:width: 60% +``` + +```{raw} html +

+Easy, fast, and cheap LLM serving for everyone
+
+Star
+Watch
+Fork

+``` + +vLLM is a fast and easy-to-use library for LLM inference and serving. + +vLLM is fast with: + +- State-of-the-art serving throughput +- Efficient management of attention key and value memory with **PagedAttention** +- Continuous batching of incoming requests +- Fast model execution with CUDA/HIP graph +- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Speculative decoding +- Chunked prefill + +vLLM is flexible and easy to use with: + +- Seamless integration with popular HuggingFace models +- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +- Tensor parallelism and pipeline parallelism support for distributed inference +- Streaming outputs +- OpenAI-compatible API server +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Prefix caching support +- Multi-lora support + +For more information, check out the following: + +- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) +- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) +- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. +- {ref}`vLLM Meetups `. + +## Documentation + +```{toctree} +:caption: Getting Started +:maxdepth: 1 + +getting_started/installation +getting_started/amd-installation +getting_started/openvino-installation +getting_started/cpu-installation +getting_started/gaudi-installation +getting_started/arm-installation +getting_started/neuron-installation +getting_started/tpu-installation +getting_started/xpu-installation +getting_started/quickstart +getting_started/debugging +getting_started/examples/examples_index +``` + +```{toctree} +:caption: Serving +:maxdepth: 1 + +serving/openai_compatible_server +serving/deploying_with_docker +serving/deploying_with_k8s +serving/deploying_with_helm +serving/deploying_with_nginx +serving/distributed_serving +serving/metrics +serving/integrations +serving/tensorizer +serving/runai_model_streamer +``` + +```{toctree} +:caption: Models +:maxdepth: 1 + +models/supported_models +models/generative_models +models/pooling_models +models/adding_model +models/enabling_multimodal_inputs +``` + +```{toctree} +:caption: Usage +:maxdepth: 1 + +usage/lora +usage/multimodal_inputs +usage/tool_calling +usage/structured_outputs +usage/spec_decode +usage/compatibility_matrix +usage/performance +usage/faq +usage/engine_args +usage/env_vars +usage/usage_stats +usage/disagg_prefill +``` + +```{toctree} +:caption: Quantization +:maxdepth: 1 + +quantization/supported_hardware +quantization/auto_awq +quantization/bnb +quantization/gguf +quantization/int8 +quantization/fp8 +quantization/fp8_e5m2_kvcache +quantization/fp8_e4m3_kvcache +``` + +```{toctree} +:caption: Automatic Prefix Caching +:maxdepth: 1 + +automatic_prefix_caching/apc +automatic_prefix_caching/details +``` + +```{toctree} +:caption: Performance +:maxdepth: 1 + +performance/benchmarks +``` + +% Community: User community resources + +```{toctree} +:caption: Community +:maxdepth: 1 + +community/meetups +community/sponsors +``` + +% API Documentation: API reference aimed at vllm library usage + +```{toctree} +:caption: API Documentation +:maxdepth: 2 + 
+dev/sampling_params +dev/pooling_params +dev/offline_inference/offline_index +dev/engine/engine_index +``` + +% Design: docs about vLLM internals + +```{toctree} +:caption: Design +:maxdepth: 2 + +design/arch_overview +design/huggingface_integration +design/plugin_system +design/input_processing/model_inputs_index +design/kernel/paged_attention +design/multimodal/multimodal_index +design/multiprocessing +``` + +% For Developers: contributing to the vLLM project + +```{toctree} +:caption: For Developers +:maxdepth: 2 + +contributing/overview +contributing/profiling/profiling_index +contributing/dockerfile/dockerfile +``` + +# Indices and tables + +- {ref}`genindex` +- {ref}`modindex` diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index fd741ea5e9766..0000000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,193 +0,0 @@ -Welcome to vLLM! -================ - -.. figure:: ./assets/logos/vllm-logo-text-light.png - :width: 60% - :align: center - :alt: vLLM - :class: no-scaled-link - -.. raw:: html - -

- Easy, fast, and cheap LLM serving for everyone - -

- -

- - Star - Watch - Fork -

- - - -vLLM is a fast and easy-to-use library for LLM inference and serving. - -vLLM is fast with: - -* State-of-the-art serving throughput -* Efficient management of attention key and value memory with **PagedAttention** -* Continuous batching of incoming requests -* Fast model execution with CUDA/HIP graph -* Quantization: `GPTQ `_, `AWQ `_, INT4, INT8, and FP8 -* Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. -* Speculative decoding -* Chunked prefill - -vLLM is flexible and easy to use with: - -* Seamless integration with popular HuggingFace models -* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more -* Tensor parallelism and pipeline parallelism support for distributed inference -* Streaming outputs -* OpenAI-compatible API server -* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. -* Prefix caching support -* Multi-lora support - -For more information, check out the following: - -* `vLLM announcing blog post `_ (intro to PagedAttention) -* `vLLM paper `_ (SOSP 2023) -* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. -* :ref:`vLLM Meetups `. - - -Documentation -------------- - -.. toctree:: - :maxdepth: 1 - :caption: Getting Started - - getting_started/installation - getting_started/amd-installation - getting_started/openvino-installation - getting_started/cpu-installation - getting_started/gaudi-installation - getting_started/arm-installation - getting_started/neuron-installation - getting_started/tpu-installation - getting_started/xpu-installation - getting_started/quickstart - getting_started/debugging - getting_started/examples/examples_index - -.. toctree:: - :maxdepth: 1 - :caption: Serving - - serving/openai_compatible_server - serving/deploying_with_docker - serving/deploying_with_k8s - serving/deploying_with_helm - serving/deploying_with_nginx - serving/distributed_serving - serving/metrics - serving/integrations - serving/tensorizer - -.. toctree:: - :maxdepth: 1 - :caption: Models - - models/supported_models - models/generative_models - models/pooling_models - models/adding_model - models/enabling_multimodal_inputs - -.. toctree:: - :maxdepth: 1 - :caption: Usage - - usage/lora - usage/multimodal_inputs - usage/tool_calling - usage/structured_outputs - usage/spec_decode - usage/compatibility_matrix - usage/performance - usage/faq - usage/engine_args - usage/env_vars - usage/usage_stats - usage/disagg_prefill - -.. toctree:: - :maxdepth: 1 - :caption: Quantization - - quantization/supported_hardware - quantization/auto_awq - quantization/bnb - quantization/gguf - quantization/int8 - quantization/fp8 - quantization/fp8_e5m2_kvcache - quantization/fp8_e4m3_kvcache - -.. toctree:: - :maxdepth: 1 - :caption: Automatic Prefix Caching - - automatic_prefix_caching/apc - automatic_prefix_caching/details - -.. toctree:: - :maxdepth: 1 - :caption: Performance - - performance/benchmarks - -.. Community: User community resources - -.. toctree:: - :maxdepth: 1 - :caption: Community - - community/meetups - community/sponsors - -.. API Documentation: API reference aimed at vllm library usage - -.. toctree:: - :maxdepth: 2 - :caption: API Documentation - - dev/sampling_params - dev/pooling_params - dev/offline_inference/offline_index - dev/engine/engine_index - -.. Design: docs about vLLM internals - -.. 
toctree:: - :maxdepth: 2 - :caption: Design - - design/arch_overview - design/huggingface_integration - design/plugin_system - design/input_processing/model_inputs_index - design/kernel/paged_attention - design/multimodal/multimodal_index - design/multiprocessing - -.. For Developers: contributing to the vLLM project - -.. toctree:: - :maxdepth: 2 - :caption: For Developers - - contributing/overview - contributing/profiling/profiling_index - contributing/dockerfile/dockerfile - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md new file mode 100644 index 0000000000000..02537fba020c4 --- /dev/null +++ b/docs/source/models/adding_model.md @@ -0,0 +1,155 @@ +(adding-a-new-model)= + +# Adding a New Model + +This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. + +```{note} +The complexity of adding a new model depends heavily on the model's architecture. +The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. +However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. +``` + +```{note} +By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, +please follow [this guide](#enabling-multimodal-inputs) after implementing the model here. +``` + +```{tip} +If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository. +We will be happy to help you out! +``` + +## 0. Fork the vLLM repository + +Start by forking our [GitHub] repository and then [build it from source](#build-from-source). +This gives you the ability to modify the codebase and test your model. + +```{tip} +If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. +``` + +## 1. Bring your model code + +Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the directory. +For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. + +```{warning} +When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. +``` + +## 2. Make your code compatible with vLLM + +To ensure compatibility with vLLM, your model must meet the following requirements: + +### Initialization Code + +All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: + +- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. +- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. 
+ +The initialization code should look like this: + +```python +from torch import nn +from vllm.config import VllmConfig +from vllm.attention import Attention + +class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + +class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + +class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + +class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") +``` + +### Computation Code + +Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +```python +def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, +) -> torch.Tensor: + ... +``` + +```{note} +Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. +If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +``` + +For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. + +## 3. (Optional) Implement tensor parallelism and quantization support + +If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. +To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. +For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`. +When it comes to the linear layers, we provide the following options to parallelize them: + +- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. +- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. +- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. +- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). 
This class handles the sharded weight loading logic of multiple weight matrices. +- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. + +Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. + +## 4. Implement the weight loading logic + +You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class. +This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. + +## 5. Register your model + +Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in . + +## 6. Out-of-Tree Model Integration + +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see [plugin-system](#plugin-system). + +To register the model, use the following code: + +```python +from vllm import ModelRegistry +from your_code import YourModelForCausalLM +ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +``` + +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +```python +from vllm import ModelRegistry + +ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") +``` + +```{important} +If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +Read more about that [here](#enabling-multimodal-inputs). +``` + +```{note} +Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. +``` diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst deleted file mode 100644 index df06d736ca86b..0000000000000 --- a/docs/source/models/adding_model.rst +++ /dev/null @@ -1,159 +0,0 @@ -.. _adding_a_new_model: - -Adding a New Model -================== - -This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. - -.. note:: - The complexity of adding a new model depends heavily on the model's architecture. - The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. - However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. - -.. note:: - By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, - please follow :ref:`this guide ` after implementing the model here. - -.. 
tip:: - If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. - We will be happy to help you out! - - -0. Fork the vLLM repository --------------------------------- - -Start by forking our `GitHub`_ repository and then :ref:`build it from source `. -This gives you the ability to modify the codebase and test your model. - -.. tip:: - If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. - -1. Bring your model code ------------------------- - -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. -For instance, vLLM's `OPT model `_ was adapted from the HuggingFace's `modeling_opt.py `_ file. - -.. warning:: - When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. - - -2. Make your code compatible with vLLM --------------------------------------- - -To ensure compatibility with vLLM, your model must meet the following requirements: - -Initialization Code -^^^^^^^^^^^^^^^^^^^ - -All vLLM modules within the model must include a ``prefix`` argument in their constructor. This ``prefix`` is typically the full name of the module in the model's state dictionary and is crucial for: - -* Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. -* Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the ``prefix`` during initialization, vLLM can match the current layer's ``prefix`` with the quantization configuration to determine if the layer should be initialized in quantized mode. - -The initialization code should look like this: - -.. code-block:: python - - from torch import nn - from vllm.config import VllmConfig - from vllm.attention import Attention - - class MyAttention(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.attn = Attention(prefix=f"{prefix}.attn") - - class MyDecoderLayer(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") - - class MyModel(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.layers = nn.ModuleList( - [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] - ) - - class MyModelForCausalLM(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.model = MyModel(vllm_config, prefix=f"{prefix}.model") - -Computation Code -^^^^^^^^^^^^^^^^ - -Rewrite the :meth:`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat ``input_ids`` and ``positions`` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. - -.. code-block:: python - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - ... - -.. 
note:: - Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. - If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. - -For reference, check out the `LLAMA model `__. vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the `vLLM models `__ directory for more examples. - -3. (Optional) Implement tensor parallelism and quantization support -------------------------------------------------------------------- - -If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. -To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`. -When it comes to the linear layers, we provide the following options to parallelize them: - -* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. -* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. -* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. -* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. - -Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. - -4. Implement the weight loading logic -------------------------------------- - -You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. - -5. Register your model ----------------------- - -Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. - -6. Out-of-Tree Model Integration --------------------------------- - -You can integrate a model without modifying the vLLM codebase. 
Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`. - -To register the model, use the following code: - -.. code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForCausalLM - ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - -If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: - -.. code-block:: python - - from vllm import ModelRegistry - - ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") - -.. important:: - If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - Read more about that :ref:`here `. - -.. note:: - Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md new file mode 100644 index 0000000000000..fdd770887900e --- /dev/null +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -0,0 +1,143 @@ +(enabling-multimodal-inputs)= + +# Enabling Multimodal Inputs + +This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs). + +```{seealso} +[Adding a New Model](adding-a-new-model) +``` + +## 1. Update the base vLLM model + +It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model). +Further update the model as follows: + +- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + + ```diff + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + ``` + + ```{note} + The model class does not have to be named {code}`*ForCausalLM`. + Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. + ``` + +- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward` + for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + +## 2. Register input mappers + +For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper `. +This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`. + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_image_input_mapper() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +A default mapper is available for each modality in the core vLLM library. 
This input mapper will be used if you do not provide your own function. + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` + +## 3. Register maximum number of multi-modal tokens + +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item +and register it via {meth}`INPUT_REGISTRY.register_dummy_data `. + +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() ++ @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +Here are some examples: + +- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) +- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` + +## 4. (Optional) Register dummy data + +During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. +In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data `. + +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() ++ @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +```{note} +The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. +``` + +Here are some examples: + +- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) +- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` + +## 5. (Optional) Register input processor + +Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor. +This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call. +You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor `. + +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() ++ @INPUT_REGISTRY.register_input_processor() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. 
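As a rough illustration only (the constants, function name, and exact input format below are assumptions for this sketch, not vLLM APIs; see the model implementations linked below for the real signatures), such a processor might expand a single image placeholder token into one token per image feature:

```python
# Hypothetical sketch of an input processor that repeats an image placeholder
# token so the language model reserves one position per image feature.
IMAGE_TOKEN_ID = 32000          # assumed placeholder token id of your model
NUM_IMAGE_FEATURE_TOKENS = 576  # assumed (fixed) number of image feature tokens

def input_processor_for_your_model(ctx, inputs):
    multi_modal_data = inputs.get("multi_modal_data")
    if not multi_modal_data or "image" not in multi_modal_data:
        # Text-only prompt: nothing to expand.
        return inputs

    new_token_ids = []
    for token_id in inputs["prompt_token_ids"]:
        if token_id == IMAGE_TOKEN_ID:
            # Reserve one position per image feature so the attention mask
            # and KV cache are sized correctly.
            new_token_ids.extend([IMAGE_TOKEN_ID] * NUM_IMAGE_FEATURE_TOKENS)
        else:
            new_token_ids.append(token_id)

    inputs["prompt_token_ids"] = new_token_ids
    return inputs
```

Such a function would then be passed to the {code}`INPUT_REGISTRY.register_input_processor` decorator shown above.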
+Here are some examples: + +- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) +- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst deleted file mode 100644 index 5c1236e1a8972..0000000000000 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. _enabling_multimodal_inputs: - -Enabling Multimodal Inputs -========================== - -This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs `. - -.. seealso:: - :ref:`adding_a_new_model` - - -1. Update the base vLLM model ------------------------------ - -It is assumed that you have already implemented the model in vLLM according to :ref:`these steps `. -Further update the model as follows: - -- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - - .. code-block:: diff - - + from vllm.model_executor.models.interfaces import SupportsMultiModal - - - class YourModelForImage2Seq(nn.Module): - + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - - .. note:: - The model class does not have to be named :code:`*ForCausalLM`. - Check out `the HuggingFace Transformers documentation `__ for some examples. - -- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward` - for each input tensor that corresponds to a multi-modal input, as shown in the following example: - - .. code-block:: diff - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - - -2. Register input mappers -------------------------- - -For each modality type that the model accepts as input, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper `. -This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`. - -.. code-block:: diff - - from vllm.model_executor.models.interfaces import SupportsMultiModal - + from vllm.multimodal import MULTIMODAL_REGISTRY - - + @MULTIMODAL_REGISTRY.register_image_input_mapper() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. - -.. seealso:: - :ref:`input_processing_pipeline` - - -3. Register maximum number of multi-modal tokens ------------------------------------------------- - -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item -and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. - -.. 
code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - + @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -Here are some examples: - -- Image inputs (static feature size): `LLaVA-1.5 Model `__ -- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` - - -4. (Optional) Register dummy data ---------------------------------- - -During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. -In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data `. - -.. code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - + @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -.. note:: - The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. - -Here are some examples: - -- Image inputs (static feature size): `LLaVA-1.5 Model `__ -- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` - - -5. (Optional) Register input processor --------------------------------------- - -Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. -This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call. -You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor `. - -.. code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - + @INPUT_REGISTRY.register_input_processor() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. -Here are some examples: - -- Insert static number of image tokens: `LLaVA-1.5 Model `__ -- Insert dynamic number of image tokens: `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md new file mode 100644 index 0000000000000..35e0302b86619 --- /dev/null +++ b/docs/source/models/generative_models.md @@ -0,0 +1,126 @@ +(generative-models)= + +# Generative Models + +vLLM provides first-class support for generative models, which covers most of LLMs. + +In vLLM, generative models implement the {class}`~vllm.model_executor.models.VllmModelForTextGeneration` interface. 
+Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, +which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. + +For generative models, the only supported {code}`task` option is {code}`"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + +### `LLM.generate` + +The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. +It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate), +except that tokenization and detokenization are also performed automatically. + +```python +llm = LLM(model="facebook/opt-125m") +outputs = llm.generate("Hello, my name is") + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. +For example, you can use greedy sampling by setting {code}`temperature=0`: + +```python +llm = LLM(model="facebook/opt-125m") +params = SamplingParams(temperature=0) +outputs = llm.generate("Hello, my name is", params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +A code example can be found here: + +### `LLM.beam_search` + +The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search-decoding) on top of {class}`~vllm.LLM.generate`. +For example, to search using 5 beams and output at most 50 tokens: + +```python +llm = LLM(model="facebook/opt-125m") +params = BeamSearchParams(beam_width=5, max_tokens=50) +outputs = llm.generate("Hello, my name is", params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +### `LLM.chat` + +The {class}`~vllm.LLM.chat` method implements chat functionality on top of {class}`~vllm.LLM.generate`. +In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) +and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. + +```{important} +In general, only instruction-tuned models have a chat template. +Base models may perform poorly as they are not trained to respond to the chat conversation. +``` + +```python +llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") +conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" 
+ }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, +] +outputs = llm.chat(conversation) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +A code example can be found here: + +If the model doesn't have a chat template or you want to specify another one, +you can explicitly pass a chat template: + +```python +from vllm.entrypoints.chat_utils import load_chat_template + +# You can find a list of existing chat templates under `examples/` +custom_template = load_chat_template(chat_template="") +print("Loaded chat template:", custom_template) + +outputs = llm.chat(conversation, chat_template=custom_template) +``` + +## Online Inference + +Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: + +- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. +- [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. diff --git a/docs/source/models/generative_models.rst b/docs/source/models/generative_models.rst deleted file mode 100644 index fb71185600863..0000000000000 --- a/docs/source/models/generative_models.rst +++ /dev/null @@ -1,146 +0,0 @@ -.. _generative_models: - -Generative Models -================= - -vLLM provides first-class support for generative models, which covers most of LLMs. - -In vLLM, generative models implement the :class:`~vllm.model_executor.models.VllmModelForTextGeneration` interface. -Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, -which are then passed through :class:`~vllm.model_executor.layers.Sampler` to obtain the final text. - -Offline Inference ------------------ - -The :class:`~vllm.LLM` class provides various methods for offline inference. -See :ref:`Engine Arguments ` for a list of options when initializing the model. - -For generative models, the only supported :code:`task` option is :code:`"generate"`. -Usually, this is automatically inferred so you don't have to specify it. - -``LLM.generate`` -^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.generate` method is available to all generative models in vLLM. -It is similar to `its counterpart in HF Transformers `__, -except that tokenization and detokenization are also performed automatically. - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - outputs = llm.generate("Hello, my name is") - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -You can optionally control the language generation by passing :class:`~vllm.SamplingParams`. -For example, you can use greedy sampling by setting :code:`temperature=0`: - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - params = SamplingParams(temperature=0) - outputs = llm.generate("Hello, my name is", params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -A code example can be found in `examples/offline_inference.py `_. - -``LLM.beam_search`` -^^^^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.beam_search` method implements `beam search `__ on top of :class:`~vllm.LLM.generate`. 
-For example, to search using 5 beams and output at most 50 tokens: - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - params = BeamSearchParams(beam_width=5, max_tokens=50) - outputs = llm.generate("Hello, my name is", params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -``LLM.chat`` -^^^^^^^^^^^^ - -The :class:`~vllm.LLM.chat` method implements chat functionality on top of :class:`~vllm.LLM.generate`. -In particular, it accepts input similar to `OpenAI Chat Completions API `__ -and automatically applies the model's `chat template `__ to format the prompt. - -.. important:: - - In general, only instruction-tuned models have a chat template. - Base models may perform poorly as they are not trained to respond to the chat conversation. - -.. code-block:: python - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, - ] - outputs = llm.chat(conversation) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -A code example can be found in `examples/offline_inference_chat.py `_. - -If the model doesn't have a chat template or you want to specify another one, -you can explicitly pass a chat template: - -.. code-block:: python - - from vllm.entrypoints.chat_utils import load_chat_template - - # You can find a list of existing chat templates under `examples/` - custom_template = load_chat_template(chat_template="") - print("Loaded chat template:", custom_template) - - outputs = llm.chat(conversation, chat_template=custom_template) - -Online Inference ----------------- - -Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. -Please click on the above link for more details on how to launch the server. - -Completions API -^^^^^^^^^^^^^^^ - -Our Completions API is similar to ``LLM.generate`` but only accepts text. -It is compatible with `OpenAI Completions API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_completion_client.py `_. - -Chat API -^^^^^^^^ - -Our Chat API is similar to ``LLM.chat``, accepting both text and :ref:`multi-modal inputs `. -It is compatible with `OpenAI Chat Completions API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_chat_completion_client.py `_. diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md new file mode 100644 index 0000000000000..76c96c9edcc5d --- /dev/null +++ b/docs/source/models/pooling_models.md @@ -0,0 +1,113 @@ +(pooling-models)= + +# Pooling Models + +vLLM also supports pooling models, including embedding, reranking and reward models. + +In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmModelForPooling` interface. +These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input +before returning them. + +```{note} +We currently support pooling models primarily as a matter of convenience. 
+As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to +pooling models as they only work on the generation or decode stage, so performance may not improve as much. +``` + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. + +For pooling models, we support the following {code}`task` options: + +- Embedding ({code}`"embed"` / {code}`"embedding"`) +- Classification ({code}`"classify"`) +- Sentence Pair Scoring ({code}`"score"`) +- Reward Modeling ({code}`"reward"`) + +The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used: + +- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. +- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. +- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. +- Reward Modeling: Extract all of the hidden states and return them directly. + +When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, +we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`). + +You can customize the model's pooling method via the {code}`override_pooler_config` option, +which takes priority over both the model's and Sentence Transformers's defaults. + +### `LLM.encode` + +The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. +It returns the extracted hidden states directly, which is useful for reward models. + +```python +llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") +(output,) = llm.encode("Hello, my name is") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +### `LLM.embed` + +The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt. +It is primarily designed for embedding models. + +```python +llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") +(output,) = llm.embed("Hello, my name is") + +embeds = output.outputs.embedding +print(f"Embeddings: {embeds!r} (size={len(embeds)})") +``` + +A code example can be found here: + +### `LLM.classify` + +The {class}`~vllm.LLM.classify` method outputs a probability vector for each prompt. +It is primarily designed for classification models. + +```python +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") +(output,) = llm.classify("Hello, my name is") + +probs = output.outputs.probs +print(f"Class Probabilities: {probs!r} (size={len(probs)})") +``` + +A code example can be found here: + +### `LLM.score` + +The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs. +It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html). +These types of models serve as rerankers between candidate query-document pairs in RAG systems. + +```{note} +vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. +To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). 
+``` + +```python +llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") +(output,) = llm.score("What is the capital of France?", + "The capital of Brazil is Brasilia.") + +score = output.outputs.score +print(f"Score: {score}") +``` + +A code example can be found here: + +## Online Inference + +Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: + +- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. +- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models. +- [Score API](#score-api) is similar to `LLM.score` for cross-encoder models. diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst deleted file mode 100644 index 4e67677a2767a..0000000000000 --- a/docs/source/models/pooling_models.rst +++ /dev/null @@ -1,136 +0,0 @@ -.. _pooling_models: - -Pooling Models -============== - -vLLM also supports pooling models, including embedding, reranking and reward models. - -In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface. -These models use a :class:`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input -before returning them. - -.. note:: - - We currently support pooling models primarily as a matter of convenience. - As shown in the :ref:`Compatibility Matrix `, most vLLM features are not applicable to - pooling models as they only work on the generation or decode stage, so performance may not improve as much. - -Offline Inference ------------------ - -The :class:`~vllm.LLM` class provides various methods for offline inference. -See :ref:`Engine Arguments ` for a list of options when initializing the model. - -For pooling models, we support the following :code:`task` options: - -- Embedding (:code:`"embed"` / :code:`"embedding"`) -- Classification (:code:`"classify"`) -- Sentence Pair Scoring (:code:`"score"`) -- Reward Modeling (:code:`"reward"`) - -The selected task determines the default :class:`~vllm.model_executor.layers.Pooler` that is used: - -- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. -- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. -- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. -- Reward Modeling: Extract all of the hidden states and return them directly. - -When loading `Sentence Transformers `__ models, -we attempt to override the default pooler based on its Sentence Transformers configuration file (:code:`modules.json`). - -You can customize the model's pooling method via the :code:`override_pooler_config` option, -which takes priority over both the model's and Sentence Transformers's defaults. - -``LLM.encode`` -^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM. -It returns the extracted hidden states directly, which is useful for reward models. - -.. code-block:: python - - llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") - (output,) = llm.encode("Hello, my name is") - - data = output.outputs.data - print(f"Data: {data!r}") - -``LLM.embed`` -^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.embed` method outputs an embedding vector for each prompt. -It is primarily designed for embedding models. - -.. 
code-block:: python - - llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") - (output,) = llm.embed("Hello, my name is") - - embeds = output.outputs.embedding - print(f"Embeddings: {embeds!r} (size={len(embeds)})") - -A code example can be found in `examples/offline_inference_embedding.py `_. - -``LLM.classify`` -^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.classify` method outputs a probability vector for each prompt. -It is primarily designed for classification models. - -.. code-block:: python - - llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") - (output,) = llm.classify("Hello, my name is") - - probs = output.outputs.probs - print(f"Class Probabilities: {probs!r} (size={len(probs)})") - -A code example can be found in `examples/offline_inference_classification.py `_. - -``LLM.score`` -^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.score` method outputs similarity scores between sentence pairs. -It is primarily designed for `cross-encoder models `__. -These types of models serve as rerankers between candidate query-document pairs in RAG systems. - -.. note:: - - vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. - To handle RAG at a higher level, you should use integration frameworks such as `LangChain `_. - -.. code-block:: python - - llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") - (output,) = llm.score("What is the capital of France?", - "The capital of Brazil is Brasilia.") - - score = output.outputs.score - print(f"Score: {score}") - -A code example can be found in `examples/offline_inference_scoring.py `_. - -Online Inference ----------------- - -Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. -Please click on the above link for more details on how to launch the server. - -Embeddings API -^^^^^^^^^^^^^^ - -Our Embeddings API is similar to ``LLM.embed``, accepting both text and :ref:`multi-modal inputs `. - -The text-only API is compatible with `OpenAI Embeddings API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_embedding_client.py `_. - -The multi-modal API is an extension of the `OpenAI Embeddings API `__ -that incorporates `OpenAI Chat Completions API `__, -so it is not part of the OpenAI standard. Please see :ref:`this page ` for more details on how to use it. - -Score API -^^^^^^^^^ - -Our Score API is similar to ``LLM.score``. -Please see `this page <../serving/openai_compatible_server.html#score-api-for-cross-encoder-models>`__ for more details on how to use it. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.md similarity index 70% rename from docs/source/models/supported_models.rst rename to docs/source/models/supported_models.md index 488fcc7709c77..099e6c8f02815 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.md @@ -1,84 +1,78 @@ -.. _supported_models: +(supported-models)= -Supported Models -================ +# Supported Models vLLM supports generative and pooling models across various tasks. -If a model supports more than one task, you can set the task via the :code:`--task` argument. +If a model supports more than one task, you can set the task via the {code}`--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. 
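For example, a checkpoint that supports both generation and pooling can be pinned to a specific task explicitly; a minimal sketch (the model name here is just an illustration) looks like this:

```python
from vllm import LLM

# Load the checkpoint in pooling (embedding) mode instead of generative mode.
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
```

The equivalent option for the OpenAI-compatible server is the {code}`--task` flag, e.g. {code}`vllm serve intfloat/e5-mistral-7b-instruct --task embed`.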
-Loading a Model -^^^^^^^^^^^^^^^ +## Loading a Model -HuggingFace Hub -+++++++++++++++ +### HuggingFace Hub -By default, vLLM loads models from `HuggingFace (HF) Hub `_. +By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models). -To determine whether a given model is supported, you can check the :code:`config.json` file inside the HF repository. -If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. +To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository. +If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory. -.. tip:: - The easiest way to check if your model is really supported at runtime is to run the program below: +````{tip} +The easiest way to check if your model is really supported at runtime is to run the program below: - .. code-block:: python +```python +from vllm import LLM - from vllm import LLM +# For generative models (task=generate) only +llm = LLM(model=..., task="generate") # Name or path of your model +output = llm.generate("Hello, my name is") +print(output) - # For generative models (task=generate) only - llm = LLM(model=..., task="generate") # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) +# For pooling models (task={embed,classify,reward,score}) only +llm = LLM(model=..., task="embed") # Name or path of your model +output = llm.encode("Hello, my name is") +print(output) +``` - # For pooling models (task={embed,classify,reward}) only - llm = LLM(model=..., task="embed") # Name or path of your model - output = llm.encode("Hello, my name is") - print(output) +If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. +```` - If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. +Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM. +Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. -Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` -for instructions on how to implement your model in vLLM. -Alternatively, you can `open an issue on GitHub `_ to request vLLM support. +### ModelScope -ModelScope -++++++++++ +To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: -To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: +```shell +$ export VLLM_USE_MODELSCOPE=True +``` -.. code-block:: shell +And use with {code}`trust_remote_code=True`. - $ export VLLM_USE_MODELSCOPE=True +```python +from vllm import LLM -And use with :code:`trust_remote_code=True`. +llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) -.. 
code-block:: python +# For generative models (task=generate) only +output = llm.generate("Hello, my name is") +print(output) - from vllm import LLM +# For pooling models (task={embed,classify,reward,score}) only +output = llm.encode("Hello, my name is") +print(output) +``` - llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) +## List of Text-only Language Models - # For generative models (task=generate) only - output = llm.generate("Hello, my name is") - print(output) +### Generative Models - # For pooling models (task={embed,classify,reward}) only - output = llm.encode("Hello, my name is") - print(output) +See [this page](#generative-models) for more information on how to use generative models. -List of Text-only Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Generative Models -+++++++++++++++++ - -See :ref:`this page ` for more information on how to use generative models. - -Text Generation (``--task generate``) -------------------------------------- +#### Text Generation (`--task generate`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -86,8 +80,8 @@ Text Generation (``--task generate``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`AquilaForCausalLM` - Aquila, Aquila2 - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. @@ -111,8 +105,8 @@ Text Generation (``--task generate``) * - :code:`BartForConditionalGeneration` - BART - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. - - - - + - + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. @@ -136,12 +130,12 @@ Text Generation (``--task generate``) * - :code:`DeepseekForCausalLM` - DeepSeek - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. - - + - - ✅︎ * - :code:`DeepseekV2ForCausalLM` - DeepSeek-V2 - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. - - + - - ✅︎ * - :code:`ExaoneForCausalLM` - EXAONE-3 @@ -316,7 +310,7 @@ Text Generation (``--task generate``) * - :code:`PersimmonForCausalLM` - Persimmon - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. - - + - - ✅︎ * - :code:`QWenLMHeadModel` - Qwen @@ -358,29 +352,24 @@ Text Generation (``--task generate``) - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - ✅︎ - ✅︎ +``` -.. note:: - Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. - -Pooling Models -++++++++++++++ - -See :ref:`this page ` for more information on how to use pooling models. +```{note} +Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +``` -.. important:: - Since some model architectures support both generative and pooling tasks, - you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +### Pooling Models -Text Embedding (``--task embed``) ---------------------------------- +See [this page](pooling-models) for more information on how to use pooling models. -Any text generation model can be converted into an embedding model by passing :code:`--task embed`. +```{important} +Since some model architectures support both generative and pooling tasks, +you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +``` -.. 
note:: - To get the best results, you should use pooling models that are specifically trained as such. - -The following table lists those that are tested in vLLM. +#### Text Embedding (`--task embed`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -388,17 +377,17 @@ The following table lists those that are tested in vLLM. * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`BertModel` - BERT-based - :code:`BAAI/bge-base-en-v1.5`, etc. - - - - + - + - * - :code:`Gemma2Model` - Gemma2-based - :code:`BAAI/bge-multilingual-gemma2`, etc. - - + - - ✅︎ * - :code:`GritLM` - GritLM @@ -418,28 +407,35 @@ The following table lists those that are tested in vLLM. * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` - RoBERTa-based - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. - - - - + - + - * - :code:`XLMRobertaModel` - XLM-RoBERTa-based - :code:`intfloat/multilingual-e5-large`, etc. - - - - + - + - +``` + +```{note} +{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. +You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`. +``` -.. note:: - :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. - You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. +```{note} +Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. +You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. -.. note:: - Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. - You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. +On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention +despite being described otherwise on its model card. +``` - On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention - despite being described otherwise on its model card. +If your model is not in the above list, we will try to automatically convert the model using +:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +of the whole prompt are extracted from the normalized hidden state corresponding to the last token. -Reward Modeling (``--task reward``) ------------------------------------ +#### Reward Modeling (`--task reward`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -447,8 +443,8 @@ Reward Modeling (``--task reward``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`LlamaForCausalLM` - Llama-based - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. @@ -459,14 +455,19 @@ Reward Modeling (``--task reward``) - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +``` + +If your model is not in the above list, we will try to automatically convert the model using +:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. -.. 
important:: - For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, - e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +```{important} +For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, +e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +``` -Classification (``--task classify``) ------------------------------------- +#### Classification (`--task classify`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -474,8 +475,8 @@ Classification (``--task classify``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`JambaForSequenceClassification` - Jamba - :code:`ai21labs/Jamba-tiny-reward-dev`, etc. @@ -486,10 +487,14 @@ Classification (``--task classify``) - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - ✅︎ - ✅︎ +``` -Sentence Pair Scoring (``--task score``) ----------------------------------------- +If your model is not in the above list, we will try to automatically convert the model using +:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +#### Sentence Pair Scoring (`--task score`) + +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -497,54 +502,53 @@ Sentence Pair Scoring (``--task score``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`BertForSequenceClassification` - BERT-based - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - - - - + - + - * - :code:`RobertaForSequenceClassification` - RoBERTa-based - :code:`cross-encoder/quora-roberta-base`, etc. - - - - + - + - * - :code:`XLMRobertaForSequenceClassification` - XLM-RoBERTa-based - :code:`BAAI/bge-reranker-v2-m3`, etc. - - - - + - + - +``` -.. _supported_mm_models: +(supported-mm-models)= -List of Multimodal Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +## List of Multimodal Language Models The following modalities are supported depending on the model: -- **T**\ ext -- **I**\ mage -- **V**\ ideo -- **A**\ udio +- **T**ext +- **I**mage +- **V**ideo +- **A**udio -Any combination of modalities joined by :code:`+` are supported. +Any combination of modalities joined by {code}`+` are supported. -- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. +- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs. -On the other hand, modalities separated by :code:`/` are mutually exclusive. +On the other hand, modalities separated by {code}`/` are mutually exclusive. -- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. +- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -See :ref:`this page ` on how to pass multi-modal inputs to the model. +See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. -Generative Models -+++++++++++++++++ +### Generative Models -See :ref:`this page ` for more information on how to use generative models. 
+See [this page](#generative-models) for more information on how to use generative models. -Text Generation (``--task generate``) -------------------------------------- +#### Text Generation (`--task generate`) +```{eval-rst} .. list-table:: :widths: 25 25 15 20 5 5 5 :header-rows: 1 @@ -553,63 +557,63 @@ Text Generation (``--task generate``) - Models - Inputs - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` - V1 * - :code:`AriaForConditionalGeneration` - Aria - T + I - :code:`rhymes-ai/Aria` - - + - - ✅︎ - - + - * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - T + I\ :sup:`E` - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - - ✅︎ - - + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - T + I - :code:`facebook/chameleon-7b` etc. - - + - - ✅︎ - - + - * - :code:`FuyuForCausalLM` - Fuyu - T + I - :code:`adept/fuyu-8b` etc. - - + - - ✅︎ - - + - * - :code:`ChatGLMModel` - GLM-4V - T + I - :code:`THUDM/glm-4v-9b` etc. - ✅︎ - ✅︎ - - + - * - :code:`H2OVLChatModel` - H2OVL - T + I\ :sup:`E+` - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - + - - ✅︎ - - + - * - :code:`Idefics3ForConditionalGeneration` - Idefics3 - T + I - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. - ✅︎ - - - + - * - :code:`InternVLChatModel` - InternVL 2.5, Mono-InternVL, InternVL 2.0 - T + I\ :sup:`E+` - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. - - + - - ✅︎ - ✅︎ * - :code:`LlavaForConditionalGeneration` @@ -625,28 +629,28 @@ Text Generation (``--task generate``) - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅︎ - - + - * - :code:`LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - T + V - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - ✅︎ - - + - * - :code:`LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - T + I\ :sup:`+` + V\ :sup:`+` - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ - - + - * - :code:`MiniCPMV` - MiniCPM-V - T + I\ :sup:`E+` - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - ✅︎ - ✅︎ - - + - * - :code:`MllamaForConditionalGeneration` - Llama 3.2 - T + I\ :sup:`+` @@ -665,7 +669,7 @@ Text Generation (``--task generate``) - NVLM-D 1.0 - T + I\ :sup:`E+` - :code:`nvidia/NVLM-D-72B`, etc. - - + - - ✅︎ - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` @@ -674,7 +678,7 @@ Text Generation (``--task generate``) - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc. - - ✅︎ - - + - * - :code:`Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + I\ :sup:`E+` @@ -702,70 +706,79 @@ Text Generation (``--task generate``) - :code:`Qwen/Qwen2-Audio-7B-Instruct` - - ✅︎ - - + - * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ - - + - * - :code:`UltravoxModel` - Ultravox - T + A\ :sup:`E+` - :code:`fixie-ai/ultravox-v0_3` - - ✅︎ - - - -| :sup:`E` Pre-computed embeddings can be inputted for this modality. -| :sup:`+` Multiple items can be inputted per text prompt for this modality. + - +``` -.. 
important:: - To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference) - or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: +```{eval-rst} +:sup:`E` Pre-computed embeddings can be inputted for this modality. - .. code-block:: python +:sup:`+` Multiple items can be inputted per text prompt for this modality. +``` - llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, - ) +````{important} +To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) +or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: - .. code-block:: bash +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` - vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` -.. note:: - vLLM currently only supports adding LoRA to the language backbone of multimodal models. +```{note} +vLLM currently only supports adding LoRA to the language backbone of multimodal models. +``` -.. note:: - To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) - and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +```{note} +To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) +and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +``` -.. note:: - The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. - For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 +```{note} +The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now. +For more details, please see: +``` -Pooling Models -++++++++++++++ +### Pooling Models -See :ref:`this page ` for more information on how to use pooling models. +See [this page](pooling-models) for more information on how to use pooling models. -.. important:: - Since some model architectures support both generative and pooling tasks, - you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +```{important} +Since some model architectures support both generative and pooling tasks, +you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +``` -Text Embedding (``--task embed``) ---------------------------------- +#### Text Embedding (`--task embed`) -Any text generation model can be converted into an embedding model by passing :code:`--task embed`. +Any text generation model can be converted into an embedding model by passing {code}`--task embed`. -.. note:: - To get the best results, you should use pooling models that are specifically trained as such. +```{note} +To get the best results, you should use pooling models that are specifically trained as such. +``` The following table lists those that are tested in vLLM. +```{eval-rst} .. 
list-table:: :widths: 25 25 15 25 5 5 :header-rows: 1 @@ -774,13 +787,13 @@ The following table lists those that are tested in vLLM. - Models - Inputs - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT-based - T / I - :code:`royokong/e5-v` - - + - - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based @@ -792,27 +805,25 @@ The following table lists those that are tested in vLLM. - Qwen2-VL-based - T + I - :code:`MrLight/dse-qwen2-2b-mrl-v1` - - + - - ✅︎ +``` ----- +______________________________________________________________________ -Model Support Policy -===================== +# Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! - 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. -.. tip:: - When comparing the output of :code:`model.generate` from HuggingFace Transformers with the output of :code:`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., `generation_config.json `__) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. +```{tip} +When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. +``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. - 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). 
This proactive approach helps users stay informed about updates and changes that may affect the models they use. - 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. @@ -821,7 +832,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore We have the following levels of testing for models: -1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests `_ for the models that have passed this test. +1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests `_ and `examples `_ for the models that have passed this test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:main/examples) for the models that have passed this test. 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/performance/benchmarks.md b/docs/source/performance/benchmarks.md new file mode 100644 index 0000000000000..39dc470a1c708 --- /dev/null +++ b/docs/source/performance/benchmarks.md @@ -0,0 +1,28 @@ +(benchmarks)= + +# Benchmark Suites + +vLLM contains two sets of benchmarks: + +- [Performance benchmarks](#performance-benchmarks) +- [Nightly benchmarks](#nightly-benchmarks) + +(performance-benchmarks)= + +## Performance Benchmarks + +The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. + +The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai). + +More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). + +(nightly-benchmarks)= + +## Nightly Benchmarks + +These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). 
They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels. + +The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). + +More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md). diff --git a/docs/source/performance/benchmarks.rst b/docs/source/performance/benchmarks.rst deleted file mode 100644 index 6d4d7b544cb5d..0000000000000 --- a/docs/source/performance/benchmarks.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _benchmarks: - -================ -Benchmark Suites -================ - -vLLM contains two sets of benchmarks: - -+ :ref:`Performance benchmarks ` -+ :ref:`Nightly benchmarks ` - - -.. _performance_benchmarks: - -Performance Benchmarks ----------------------- - -The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM. - -The latest performance results are hosted on the public `vLLM Performance Dashboard `_. - -More information on the performance benchmarks and their parameters can be found `here `__. - -.. _nightly_benchmarks: - -Nightly Benchmarks ------------------- - -These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. - -The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_. - -More information on the nightly benchmarks and their parameters can be found `here `__. \ No newline at end of file diff --git a/docs/source/quantization/auto_awq.md b/docs/source/quantization/auto_awq.md new file mode 100644 index 0000000000000..c02fbf0605a8c --- /dev/null +++ b/docs/source/quantization/auto_awq.md @@ -0,0 +1,78 @@ +(auto-awq)= + +# AutoAWQ + +```{warning} +Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better +accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low-latency +inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version. +``` + +To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). +Quantizing reduces the model's precision from FP16 to INT4, which effectively reduces the file size by ~70%. +The main benefits are lower latency and memory usage. + +You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). + +```console +$ pip install autoawq +``` + +After installing AutoAWQ, you are ready to quantize a model. 
Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: + +```python +from awq import AutoAWQForCausalLM +from transformers import AutoTokenizer + +model_path = 'mistralai/Mistral-7B-Instruct-v0.2' +quant_path = 'mistral-instruct-v0.2-awq' +quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + +# Load model +model = AutoAWQForCausalLM.from_pretrained( + model_path, **{"low_cpu_mem_usage": True, "use_cache": False} +) +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + +# Quantize +model.quantize(tokenizer, quant_config=quant_config) + +# Save quantized model +model.save_quantized(quant_path) +tokenizer.save_pretrained(quant_path) + +print(f'Model is quantized and saved at "{quant_path}"') +``` + +To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: + +```console +$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +``` + +AWQ models are also supported directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst deleted file mode 100644 index 8eb6fa2f4cbe1..0000000000000 --- a/docs/source/quantization/auto_awq.rst +++ /dev/null @@ -1,79 +0,0 @@ -.. _auto_awq: - -AutoAWQ -================== - -.. warning:: - - Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better - accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency - inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. - -To create a new 4-bit quantized model, you can leverage `AutoAWQ `_. -Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. -The main benefits are lower latency and memory usage. - -You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface `_. - -.. code-block:: console - - $ pip install autoawq - -After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: - -.. 
code-block:: python - - from awq import AutoAWQForCausalLM - from transformers import AutoTokenizer - - model_path = 'mistralai/Mistral-7B-Instruct-v0.2' - quant_path = 'mistral-instruct-v0.2-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } - - # Load model - model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} - ) - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - # Quantize - model.quantize(tokenizer, quant_config=quant_config) - - # Save quantized model - model.save_quantized(quant_path) - tokenizer.save_pretrained(quant_path) - - print(f'Model is quantized and saved at "{quant_path}"') - -To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ `_ with the following command: - -.. code-block:: console - - $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq - -AWQ models are also supported directly through the LLM entrypoint: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. - llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md new file mode 100644 index 0000000000000..8240eca1c7e03 --- /dev/null +++ b/docs/source/quantization/bnb.md @@ -0,0 +1,39 @@ +(bits-and-bytes)= + +# BitsAndBytes + +vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference. +BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. +Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. + +Below are the steps to utilize BitsAndBytes with vLLM. + +```console +$ pip install bitsandbytes>=0.45.0 +``` + +vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. + +You can find bitsandbytes quantized models on . +And usually, these repositories have a config.json file that includes a quantization_config section. + +## Read quantized checkpoint. + +```python +from vllm import LLM +import torch +# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. 
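+# As noted above, vLLM picks up the quantization_config section from the
+# checkpoint's config.json, so no separate calibration step is needed here.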
+model_id = "unsloth/tinyllama-bnb-4bit" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ +quantization="bitsandbytes", load_format="bitsandbytes") +``` + +## Inflight quantization: load as 4bit quantization + +```python +from vllm import LLM +import torch +model_id = "huggyllama/llama-7b" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ +quantization="bitsandbytes", load_format="bitsandbytes") +``` diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst deleted file mode 100644 index 84f805bb60c2a..0000000000000 --- a/docs/source/quantization/bnb.rst +++ /dev/null @@ -1,43 +0,0 @@ -.. _bits_and_bytes: - -BitsAndBytes -================== - -vLLM now supports `BitsAndBytes `_ for more efficient model inference. -BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. -Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. - -Below are the steps to utilize BitsAndBytes with vLLM. - -.. code-block:: console - - $ pip install bitsandbytes>=0.45.0 - -vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. - -You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes. -And usually, these repositories have a config.json file that includes a quantization_config section. - -Read quantized checkpoint. --------------------------- - -.. code-block:: python - - from vllm import LLM - import torch - # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. - model_id = "unsloth/tinyllama-bnb-4bit" - llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ - quantization="bitsandbytes", load_format="bitsandbytes") - -Inflight quantization: load as 4bit quantization ------------------------------------------------- - -.. code-block:: python - - from vllm import LLM - import torch - model_id = "huggyllama/llama-7b" - llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ - quantization="bitsandbytes", load_format="bitsandbytes") - diff --git a/docs/source/quantization/fp8.md b/docs/source/quantization/fp8.md new file mode 100644 index 0000000000000..b2eda74fd1e3b --- /dev/null +++ b/docs/source/quantization/fp8.md @@ -0,0 +1,192 @@ +(fp8)= + +# FP8 W8A8 + +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. +Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. + +Please visit the HF collection of [quantized FP8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127). + +The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios: + +- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. +- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. 
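+
+As a quick illustration of this tradeoff, the snippet below is a standalone sketch (not part of the vLLM API); it assumes a recent PyTorch build that exposes the `float8` dtypes:
+
+```python
+import torch
+
+# E4M3: more mantissa bits -> finer precision, but max representable value is 448.
+print(torch.finfo(torch.float8_e4m3fn))
+# E5M2: more exponent bits -> wider dynamic range (up to 57344), at lower precision.
+print(torch.finfo(torch.float8_e5m2))
+```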
+ +```{note} +FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). +FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. +``` + +## Quick Start with Online Dynamic Quantization + +Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying `--quantization="fp8"` in the command line or setting `quantization="fp8"` in the LLM constructor. + +In this mode, all Linear modules (except for the final `lm_head`) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. + +```python +from vllm import LLM +model = LLM("facebook/opt-125m", quantization="fp8") +# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB +result = model.generate("Hello, my name is") +``` + +```{warning} +Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. +``` + +## Installation + +To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: + +```console +$ pip install llmcompressor +``` + +## Quantization Process + +The quantization process involves three main steps: + +1. Loading the model +2. Applying quantization +3. Evaluating accuracy in vLLM + +### 1. Loading the Model + +Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: + +```python +from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoTokenizer + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + +model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +``` + +### 2. Applying Quantization + +For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which uses: + +- Static, per-channel quantization on the weights +- Dynamic, per-token quantization on the activations + +Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. + +```python +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# Configure the simple PTQ quantization +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + +# Apply the quantization algorithm. +oneshot(model=model, recipe=recipe) + +# Save the model. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) +``` + +### 3. 
Evaluating Accuracy + +Install `vllm` and `lm-evaluation-harness`: + +```console +$ pip install vllm lm-eval==0.4.4 +``` + +Load and run the model in `vllm`: + +```python +from vllm import LLM +model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") +model.generate("Hello my name is") +``` + +Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): + +```{note} +Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. +``` + +```console +$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic +$ lm_eval \ + --model vllm \ + --model_args pretrained=$MODEL,add_bos_token=True \ + --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 +``` + +Here's an example of the resulting scores: + +```text +|Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| +|-----|------:|----------------|-----:|-----------|---|----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| +| | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| +``` + +## Troubleshooting and Support + +If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository. + +## Deprecated Flow + +```{note} +The following information is preserved for reference and search purposes. +The quantization method described below is deprecated in favor of the `llmcompressor` method described above. +``` + +For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8). + +```bash +git clone https://github.com/neuralmagic/AutoFP8.git +pip install -e AutoFP8 +``` + +This package introduces the `AutoFP8ForCausalLM` and `BaseQuantizeConfig` objects for managing how your model will be compressed. + +## Offline Quantization with Static Activation Scaling Factors + +You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the `activation_scheme="static"` argument. + +```python +from datasets import load_dataset +from transformers import AutoTokenizer +from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig + +pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" +quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" + +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) +tokenizer.pad_token = tokenizer.eos_token + +# Load and tokenize 512 dataset samples for calibration of activation scales +ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) +examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] +examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") + +# Define quantization config with static activation scales +quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") + +# Load the model, quantize, and save checkpoint +model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) +model.quantize(examples) +model.save_quantized(quantized_model_dir) +``` + +Your model checkpoint with quantized weights and activations should be available at `Meta-Llama-3-8B-Instruct-FP8/`. +Finally, you can load the quantized model checkpoint directly in vLLM. 
+ +```python +from vllm import LLM +model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") +# INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB +result = model.generate("Hello, my name is") +``` diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst deleted file mode 100644 index 4dbf8e9d346e1..0000000000000 --- a/docs/source/quantization/fp8.rst +++ /dev/null @@ -1,204 +0,0 @@ -.. _fp8: - -FP8 W8A8 -================== - -vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. -Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. -Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. -Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. - -Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. - -The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios: - -- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and ``nan``. -- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- ``inf``, and ``nan``. The tradeoff for the increased dynamic range is lower precision of the stored values. - -.. note:: - - FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). - FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. - -Quick Start with Online Dynamic Quantization --------------------------------------------- - -Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying ``--quantization="fp8"`` in the command line or setting ``quantization="fp8"`` in the LLM constructor. - -In this mode, all Linear modules (except for the final ``lm_head``) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. - -.. code-block:: python - - from vllm import LLM - model = LLM("facebook/opt-125m", quantization="fp8") - # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB - result = model.generate("Hello, my name is") - -.. warning:: - - Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. - -Installation ------------- - -To produce performant FP8 quantized models with vLLM, you'll need to install the `llm-compressor `_ library: - -.. code-block:: console - - $ pip install llmcompressor - -Quantization Process --------------------- - -The quantization process involves three main steps: - -1. Loading the model -2. Applying quantization -3. Evaluating accuracy in vLLM - -1. Loading the Model -^^^^^^^^^^^^^^^^^^^^ - -Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: - -.. 
code-block:: python - - from llmcompressor.transformers import SparseAutoModelForCausalLM - from transformers import AutoTokenizer - - MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - - model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -2. Applying Quantization -^^^^^^^^^^^^^^^^^^^^^^^^ - -For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all ``Linear`` layers using the ``FP8_DYNAMIC`` scheme, which uses: - -- Static, per-channel quantization on the weights -- Dynamic, per-token quantization on the activations - -Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. - -.. code-block:: python - - from llmcompressor.transformers import oneshot - from llmcompressor.modifiers.quantization import QuantizationModifier - - # Configure the simple PTQ quantization - recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) - - # Apply the quantization algorithm. - oneshot(model=model, recipe=recipe) - - # Save the model. - SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" - model.save_pretrained(SAVE_DIR) - tokenizer.save_pretrained(SAVE_DIR) - -3. Evaluating Accuracy -^^^^^^^^^^^^^^^^^^^^^^ - -Install ``vllm`` and ``lm-evaluation-harness``: - -.. code-block:: console - - $ pip install vllm lm-eval==0.4.4 - -Load and run the model in ``vllm``: - -.. code-block:: python - - from vllm import LLM - model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") - model.generate("Hello my name is") - -Evaluate accuracy with ``lm_eval`` (for example on 250 samples of ``gsm8k``): - -.. note:: - - Quantized models can be sensitive to the presence of the ``bos`` token. ``lm_eval`` does not add a ``bos`` token by default, so make sure to include the ``add_bos_token=True`` argument when running your evaluations. - -.. code-block:: console - - $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic - $ lm_eval \ - --model vllm \ - --model_args pretrained=$MODEL,add_bos_token=True \ - --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 - -Here's an example of the resulting scores: - -.. code-block:: text - - |Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| - |-----|------:|----------------|-----:|-----------|---|----:|---|-----:| - |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| - | | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| - -Troubleshooting and Support ---------------------------- - -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. - - -Deprecated Flow ------------------- - -.. note:: - - The following information is preserved for reference and search purposes. - The quantization method described below is deprecated in favor of the ``llmcompressor`` method described above. - -For static per-tensor offline quantization to FP8, please install the `AutoFP8 library `_. - -.. code-block:: bash - - git clone https://github.com/neuralmagic/AutoFP8.git - pip install -e AutoFP8 - -This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed. 
- -Offline Quantization with Static Activation Scaling Factors ------------------------------------------------------------ - -You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument. - -.. code-block:: python - - from datasets import load_dataset - from transformers import AutoTokenizer - from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig - - pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" - quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" - - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) - tokenizer.pad_token = tokenizer.eos_token - - # Load and tokenize 512 dataset samples for calibration of activation scales - ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) - examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] - examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") - - # Define quantization config with static activation scales - quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") - - # Load the model, quantize, and save checkpoint - model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) - model.quantize(examples) - model.save_quantized(quantized_model_dir) - -Your model checkpoint with quantized weights and activations should be available at ``Meta-Llama-3-8B-Instruct-FP8/``. -Finally, you can load the quantized model checkpoint directly in vLLM. - -.. code-block:: python - - from vllm import LLM - model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") - # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB - result = model.generate("Hello, my name is") - diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/quantization/fp8_e4m3_kvcache.md new file mode 100644 index 0000000000000..f200c722d1d42 --- /dev/null +++ b/docs/source/quantization/fp8_e4m3_kvcache.md @@ -0,0 +1,44 @@ +(fp8-e4m3-kvcache)= + +# FP8 E4M3 KV Cache + +Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, +improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 +(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of +the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of +FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside +each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling +factors of a finer granularity (e.g. per-channel). + +These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If +this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an +unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). 
+ +To install AMMO (AlgorithMic Model Optimization): + +```console +$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +``` + +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon +offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. +Thus, LLM inference is greatly accelerated with minimal accuracy loss. + +Here is an example of how to enable this feature: + +```python +# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to +# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. + +from vllm import LLM, SamplingParams +sampling_params = SamplingParams(temperature=1.3, top_p=0.8) +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) + +# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, +# output w/o scaling factors: England, located in the southeastern part of the country. It is known +``` diff --git a/docs/source/quantization/fp8_e4m3_kvcache.rst b/docs/source/quantization/fp8_e4m3_kvcache.rst deleted file mode 100644 index cc52d8f40af8f..0000000000000 --- a/docs/source/quantization/fp8_e4m3_kvcache.rst +++ /dev/null @@ -1,47 +0,0 @@ -.. _fp8_e4m3_kvcache: - -FP8 E4M3 KV Cache -================== - -Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 -(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of -the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of -FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside -each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling -factors of a finer granularity (e.g. per-channel). - -These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If -this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an -unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). - -To install AMMO (AlgorithMic Model Optimization): - -.. code-block:: console - - $ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo - -Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon -offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. -Thus, LLM inference is greatly accelerated with minimal accuracy loss. - - -Here is an example of how to enable this feature: - -.. 
code-block:: python - - # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to - # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. - - from vllm import LLM, SamplingParams - sampling_params = SamplingParams(temperature=1.3, top_p=0.8) - llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") - prompt = "London is the capital of" - out = llm.generate(prompt, sampling_params)[0].outputs[0].text - print(out) - - # output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, - # output w/o scaling factors: England, located in the southeastern part of the country. It is known - diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/quantization/fp8_e5m2_kvcache.md new file mode 100644 index 0000000000000..3a81ab17f332f --- /dev/null +++ b/docs/source/quantization/fp8_e5m2_kvcache.md @@ -0,0 +1,31 @@ +(fp8-kv-cache)= + +# FP8 E5M2 KV Cache + +The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. +The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. + +Here is an example of how to enable this feature: + +```python +from vllm import LLM, SamplingParams +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +# Create an LLM. +llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst deleted file mode 100644 index b2d824427f786..0000000000000 --- a/docs/source/quantization/fp8_e5m2_kvcache.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. _fp8_kv_cache: - -FP8 E5M2 KV Cache -================== - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. - -Here is an example of how to enable this feature: - -.. code-block:: python - - from vllm import LLM, SamplingParams - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - # Create an LLM. - llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. 
- for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - diff --git a/docs/source/quantization/gguf.md b/docs/source/quantization/gguf.md new file mode 100644 index 0000000000000..eebf11dfc1b2b --- /dev/null +++ b/docs/source/quantization/gguf.md @@ -0,0 +1,72 @@ +(gguf)= + +# GGUF + +```{warning} +Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. +``` + +```{warning} +Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. +``` + +To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: + +```console +$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +``` + +You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: + +```console +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +``` + +```{warning} +We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. +``` + +You can also use the GGUF model directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# In this script, we demonstrate how to pass input to the chat method: +conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.chat(conversation, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/gguf.rst b/docs/source/quantization/gguf.rst deleted file mode 100644 index 9f00dc5563909..0000000000000 --- a/docs/source/quantization/gguf.rst +++ /dev/null @@ -1,73 +0,0 @@ -.. _gguf: - -GGUF -================== - -.. 
warning:: - - Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. - -.. warning:: - - Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use `gguf-split `_ tool to merge them to a single-file model. - -To run a GGUF model with vLLM, you can download and use the local GGUF model from `TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF `_ with the following command: - -.. code-block:: console - - $ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf - $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. - $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 - -You can also add ``--tensor-parallel-size 2`` to enable tensor parallelism inference with 2 GPUs: - -.. code-block:: console - - $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. - $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 - -.. warning:: - - We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. - -You can also use the GGUF model directly through the LLM entrypoint: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - # In this script, we demonstrate how to pass input to the chat method: - conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, - ] - - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. - llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.chat(conversation, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/docs/source/quantization/int8.md b/docs/source/quantization/int8.md new file mode 100644 index 0000000000000..1ac50ba987dda --- /dev/null +++ b/docs/source/quantization/int8.md @@ -0,0 +1,136 @@ +(int8)= + +# INT8 W8A8 + +vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. +This quantization method is particularly useful for reducing model size while maintaining good performance. + +Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). + +```{note} +INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). 
+``` + +## Prerequisites + +To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: + +```console +$ pip install llmcompressor +``` + +## Quantization Process + +The quantization process involves four main steps: + +1. Loading the model +2. Preparing calibration data +3. Applying quantization +4. Evaluating accuracy in vLLM + +### 1. Loading the Model + +Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: + +```python +from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoTokenizer + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +``` + +### 2. Preparing Calibration Data + +When quantizing activations to INT8, you need sample data to estimate the activation scales. +It's best to use calibration data that closely matches your deployment data. +For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: + +```python +from datasets import load_dataset + +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess the dataset +ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} +ds = ds.map(preprocess) + +def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) +ds = ds.map(tokenize, remove_columns=ds.column_names) +``` + +### 3. Applying Quantization + +Now, apply the quantization algorithms: + +```python +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier + +# Configure the quantization algorithms +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), +] + +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save the compressed model +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` + +This process creates a W8A8 model with weights and activations quantized to 8-bit integers. + +### 4. Evaluating Accuracy + +After quantization, you can load and run the model in vLLM: + +```python +from vllm import LLM +model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") +``` + +To evaluate accuracy, you can use `lm_eval`: + +```console +$ lm_eval --model vllm \ + --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ + --tasks gsm8k \ + --num_fewshot 5 \ + --limit 250 \ + --batch_size 'auto' +``` + +```{note} +Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. 
+``` + +## Best Practices + +- Start with 512 samples for calibration data (increase if accuracy drops) +- Use a sequence length of 2048 as a starting point +- Employ the chat template or instruction template that the model was trained with +- If you've fine-tuned a model, consider using a sample of your training data for calibration + +## Troubleshooting and Support + +If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository. diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst deleted file mode 100644 index aa5b251becb1c..0000000000000 --- a/docs/source/quantization/int8.rst +++ /dev/null @@ -1,145 +0,0 @@ -.. _int8: - -INT8 W8A8 -================== - -vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. -This quantization method is particularly useful for reducing model size while maintaining good performance. - -Please visit the HF collection of `quantized INT8 checkpoints of popular LLMs ready to use with vLLM `_. - -.. note:: - - INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). - -Prerequisites -------------- - -To use INT8 quantization with vLLM, you'll need to install the `llm-compressor `_ library: - -.. code-block:: console - - $ pip install llmcompressor - -Quantization Process --------------------- - -The quantization process involves four main steps: - -1. Loading the model -2. Preparing calibration data -3. Applying quantization -4. Evaluating accuracy in vLLM - -1. Loading the Model -^^^^^^^^^^^^^^^^^^^^ - -Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: - -.. code-block:: python - - from llmcompressor.transformers import SparseAutoModelForCausalLM - from transformers import AutoTokenizer - - MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", - ) - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -2. Preparing Calibration Data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When quantizing activations to INT8, you need sample data to estimate the activation scales. -It's best to use calibration data that closely matches your deployment data. -For a general-purpose instruction-tuned model, you can use a dataset like ``ultrachat``: - -.. code-block:: python - - from datasets import load_dataset - - NUM_CALIBRATION_SAMPLES = 512 - MAX_SEQUENCE_LENGTH = 2048 - - # Load and preprocess the dataset - ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") - ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - - def preprocess(example): - return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} - ds = ds.map(preprocess) - - def tokenize(sample): - return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) - ds = ds.map(tokenize, remove_columns=ds.column_names) - -3. Applying Quantization -^^^^^^^^^^^^^^^^^^^^^^^^ - -Now, apply the quantization algorithms: - -.. 
code-block:: python - - from llmcompressor.transformers import oneshot - from llmcompressor.modifiers.quantization import GPTQModifier - from llmcompressor.modifiers.smoothquant import SmoothQuantModifier - - # Configure the quantization algorithms - recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), - ] - - # Apply quantization - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - ) - - # Save the compressed model - SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" - model.save_pretrained(SAVE_DIR, save_compressed=True) - tokenizer.save_pretrained(SAVE_DIR) - -This process creates a W8A8 model with weights and activations quantized to 8-bit integers. - -4. Evaluating Accuracy -^^^^^^^^^^^^^^^^^^^^^^ - -After quantization, you can load and run the model in vLLM: - -.. code-block:: python - - from vllm import LLM - model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") - -To evaluate accuracy, you can use ``lm_eval``: - -.. code-block:: console - - $ lm_eval --model vllm \ - --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ - --tasks gsm8k \ - --num_fewshot 5 \ - --limit 250 \ - --batch_size 'auto' - -.. note:: - - Quantized models can be sensitive to the presence of the ``bos`` token. Make sure to include the ``add_bos_token=True`` argument when running evaluations. - -Best Practices --------------- - -- Start with 512 samples for calibration data (increase if accuracy drops) -- Use a sequence length of 2048 as a starting point -- Employ the chat template or instruction template that the model was trained with -- If you've fine-tuned a model, consider using a sample of your training data for calibration - -Troubleshooting and Support ---------------------------- - -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.md similarity index 82% rename from docs/source/quantization/supported_hardware.rst rename to docs/source/quantization/supported_hardware.md index 09f8e7112cf0c..843ee21627d78 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.md @@ -1,132 +1,132 @@ -.. _supported_hardware_for_quantization: - -Supported Hardware for Quantization Kernels -=========================================== - -The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - -.. 
list-table:: - :header-rows: 1 - :widths: 20 8 8 8 8 8 8 8 8 8 8 - - * - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - x86 CPU - - AWS Inferentia - - Google TPU - * - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ - * - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - * - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - -Notes: -^^^^^^ - -- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- "✅︎" indicates that the quantization method is supported on the specified hardware. -- "✗" indicates that the quantization method is not supported on the specified hardware. - -Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. - -For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. +(supported-hardware-for-quantization)= + +# Supported Hardware for Quantization Kernels + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +```{eval-rst} +.. list-table:: + :header-rows: 1 + :widths: 20 8 8 8 8 8 8 8 8 8 8 + + * - Implementation + - Volta + - Turing + - Ampere + - Ada + - Hopper + - AMD GPU + - Intel GPU + - x86 CPU + - AWS Inferentia + - Google TPU + * - AWQ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + * - GPTQ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + * - Marlin (GPTQ/AWQ/FP8) + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - INT8 (W8A8) + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ + * - FP8 (W8A8) + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + * - AQLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - bitsandbytes + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - DeepSpeedFP + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - GGUF + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +``` + +## Notes: + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅︎" indicates that the quantization method is supported on the specified hardware. +- "✗" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + +For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. 
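+
+As a quick, informal way to see which GPU column of the table applies to your machine, you can query the device's compute capability with PyTorch. This is only a convenience sketch based on the SM versions listed in the notes above; it is not part of vLLM itself:
+
+```python
+import torch
+
+# Map CUDA compute capability (SM version) to the architecture names used in the table above.
+ARCH_BY_SM = {
+    (7, 0): "Volta",
+    (7, 5): "Turing",
+    (8, 0): "Ampere",
+    (8, 6): "Ampere",
+    (8, 9): "Ada",
+    (9, 0): "Hopper",
+}
+
+if torch.cuda.is_available():
+    major, minor = torch.cuda.get_device_capability()
+    arch = ARCH_BY_SM.get((major, minor), f"unknown (SM {major}.{minor})")
+    print(f"{torch.cuda.get_device_name()}: SM {major}.{minor} -> {arch}")
+else:
+    print("No CUDA device detected; see the non-NVIDIA columns of the table instead.")
+```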
diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/serving/deploying_with_bentoml.md
new file mode 100644
index 0000000000000..dfa0de4f0f6d7
--- /dev/null
+++ b/docs/source/serving/deploying_with_bentoml.md
@@ -0,0 +1,7 @@
+(deploying-with-bentoml)=
+
+# Deploying with BentoML
+
+[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
+
+For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst
deleted file mode 100644
index 4b9d19f5bdb72..0000000000000
--- a/docs/source/serving/deploying_with_bentoml.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-.. _deploying_with_bentoml:
-
-Deploying with BentoML
-======================
-
-`BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
-
-For details, see the tutorial `vLLM inference in the BentoML documentation `_.
\ No newline at end of file
diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md
new file mode 100644
index 0000000000000..4863936236119
--- /dev/null
+++ b/docs/source/serving/deploying_with_cerebrium.md
@@ -0,0 +1,109 @@
+(deploying-with-cerebrium)=
+
+# Deploying with Cerebrium
+
+```{raw} html
+

+ vLLM_plus_cerebrium +

+```
+
+vLLM can be run on a cloud-based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI-based applications.
+
+To install the Cerebrium client, run:
+
+```console
+$ pip install cerebrium
+$ cerebrium login
+```
+
+Next, to create your Cerebrium project, run:
+
+```console
+$ cerebrium init vllm-project
+```
+
+Next, to install the required packages, add the following to your `cerebrium.toml`:
+
+```toml
+[cerebrium.deployment]
+docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+
+[cerebrium.dependencies.pip]
+vllm = "latest"
+```
+
+Next, add the code that handles inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example) to your `main.py`:
+
+```python
+from vllm import LLM, SamplingParams
+
+llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
+
+def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
+
+    sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    results = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        results.append({"prompt": prompt, "generated_text": generated_text})
+
+    return {"results": results}
+```
+
+Then, run the following command to deploy it to the cloud:
+
+```console
+$ cerebrium deploy
+```
+
+If the deployment succeeds, you will get back a curl command that you can use to call your endpoint. Just remember to end the URL with the name of the function you are calling (in our case `/run`):
+
+```console
+curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: ' \
+    --data '{
+      "prompts": [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is"
+      ]
+    }'
+```
+
+You should get a response like:
+
+```json
+{
+    "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
+    "result": {
+        "result": [
+            {
+                "prompt": "Hello, my name is",
+                "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
+            },
+            {
+                "prompt": "The president of the United States is",
+                "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
+            },
+            {
+                "prompt": "The capital of France is",
+                "generated_text": " Paris.\n"
+            },
+            {
+                "prompt": "The future of AI is",
+                "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
+            }
+        ]
+    },
+    "run_time_ms": 152.53663063049316
+}
+```
+
+You now have an autoscaling endpoint where you only pay for the compute you use!
diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst
deleted file mode 100644
index 9585b6ef5cb38..0000000000000
--- a/docs/source/serving/deploying_with_cerebrium.rst
+++ /dev/null
@@ -1,112 +0,0 @@
-.. _deploying_with_cerebrium:
-
-Deploying with Cerebrium
-============================
-
-.. raw:: html
-

- vLLM_plus_cerebrium -

- -vLLM can be run on a cloud based GPU machine with `Cerebrium `__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. - -To install the Cerebrium client, run: - -.. code-block:: console - - $ pip install cerebrium - $ cerebrium login - -Next, create your Cerebrium project, run: - -.. code-block:: console - - $ cerebrium init vllm-project - -Next, to install the required packages, add the following to your cerebrium.toml: - -.. code-block:: toml - - [cerebrium.deployment] - docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" - - [cerebrium.dependencies.pip] - vllm = "latest" - -Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py`: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") - - def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): - - sampling_params = SamplingParams(temperature=temperature, top_p=top_p) - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - results = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - results.append({"prompt": prompt, "generated_text": generated_text}) - - return {"results": results} - - -Then, run the following code to deploy it to the cloud - -.. code-block:: console - - $ cerebrium deploy - -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) - -.. code-block:: python - - curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ - -H 'Content-Type: application/json' \ - -H 'Authorization: ' \ - --data '{ - "prompts": [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is" - ] - }' - -You should get a response like: - -.. code-block:: python - - { - "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", - "result": { - "result": [ - { - "prompt": "Hello, my name is", - "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" - }, - { - "prompt": "The president of the United States is", - "generated_text": " elected every four years. This is a democratic system.\n\n5. What" - }, - { - "prompt": "The capital of France is", - "generated_text": " Paris.\n" - }, - { - "prompt": "The future of AI is", - "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." - } - ] - }, - "run_time_ms": 152.53663063049316 - } - -You now have an autoscaling endpoint where you only pay for the compute you use! - diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/serving/deploying_with_docker.md new file mode 100644 index 0000000000000..844bd27800c7a --- /dev/null +++ b/docs/source/serving/deploying_with_docker.md @@ -0,0 +1,81 @@ +(deploying-with-docker)= + +# Deploying with Docker + +## Use vLLM's Official Docker Image + +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). 
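+
+If you want to fetch the image ahead of time (for example, to pre-warm a node before serving), you can pull it explicitly first; this is optional and uses the same `latest` tag as the run command below:
+
+```console
+$ docker pull vllm/vllm-openai:latest
+```
+
+You can then start the OpenAI-compatible server with a command like the following: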
+
+```console
+$ docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=" \
+    -p 8000:8000 \
+    --ipc=host \
+    vllm/vllm-openai:latest \
+    --model mistralai/Mistral-7B-v0.1
+```
+
+```{note}
+You can use either the `--ipc=host` flag or the `--shm-size` flag to allow the
+container to access the host's shared memory. vLLM uses PyTorch, which uses shared
+memory to share data between processes under the hood, particularly for tensor parallel inference.
+```
+
+## Building vLLM's Docker Image from Source
+
+You can build and run vLLM from source via the provided `Dockerfile`. To build vLLM:
+
+```console
+$ # optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
+```
+
+```{note}
+By default vLLM will build for all GPU types for widest distribution. If you are just building for the
+current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
+for vLLM to find the current GPU type and build for that.
+```
+
+## Building for Arm64/aarch64
+
+A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of this writing, this requires the use
+of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
+
+```{note}
+Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
+flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
+Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
+```
+
+```console
+# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
+$ python3 use_existing_torch.py
+$ DOCKER_BUILDKIT=1 docker build . \
+    --target vllm-openai \
+    --platform "linux/arm64" \
+    -t vllm/vllm-gh200-openai:latest \
+    --build-arg max_jobs=66 \
+    --build-arg nvcc_threads=2 \
+    --build-arg torch_cuda_arch_list="9.0+PTX" \
+    --build-arg vllm_fa_cmake_gpu_arches="90-real"
+```
+
+## Use the custom-built vLLM Docker image
+
+To run vLLM with the custom-built Docker image:
+
+```console
+$ docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 8000:8000 \
+    --env "HUGGING_FACE_HUB_TOKEN=" \
+    vllm/vllm-openai
+```
+
+The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
+
+```{note}
+**For versions 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.
+```
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
deleted file mode 100644
index b64eef819cd2e..0000000000000
--- a/docs/source/serving/deploying_with_docker.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-..
_deploying_with_docker: - -Deploying with Docker -============================ - -Use vLLM's Official Docker Image --------------------------------- - -vLLM offers an official Docker image for deployment. -The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. - -.. code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 - - -.. note:: - - You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the - container to access the host's shared memory. vLLM uses PyTorch, which uses shared - memory to share data between processes under the hood, particularly for tensor parallel inference. - - -Building vLLM's Docker Image from Source ----------------------------------------- - -You can build and run vLLM from source via the provided `Dockerfile `_. To build vLLM: - -.. code-block:: console - - $ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 - $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai - -.. note:: - - By default vLLM will build for all GPU types for widest distribution. If you are just building for the - current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""`` - for vLLM to find the current GPU type and build for that. - -Building for Arm64/aarch64 --------------------------- - -A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. Using the flag ``--platform "linux/arm64"`` will attempt to build for arm64. - -.. note:: - - Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=`` - flags to speed up build process. However, ensure your ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefits. - Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). - -.. code-block:: console - - # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) - $ python3 use_existing_torch.py - $ DOCKER_BUILDKIT=1 docker build . \ - --target vllm-openai \ - --platform "linux/arm64" \ - -t vllm/vllm-gh200-openai:latest \ - --build-arg max_jobs=66 \ - --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" - -Use the custom-built vLLM Docker image --------------------------------------- - -To run vLLM with the custom-built Docker image: - -.. code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - vllm/vllm-openai - -The argument ``vllm/vllm-openai`` specifies the image to run, and should be replaced with the name of the custom-built image (the ``-t`` tag from the build command). - -.. note:: - - **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. 
If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md new file mode 100644 index 0000000000000..65ef1c0016208 --- /dev/null +++ b/docs/source/serving/deploying_with_dstack.md @@ -0,0 +1,102 @@ +(deploying-with-dstack)= + +# Deploying with dstack + +```{raw} html +

+ vLLM_plus_dstack +

+```
+
+vLLM can be run on a cloud-based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
+
+To install the dstack client, run:
+
+```console
+$ pip install "dstack[all]"
+$ dstack server
+```
+
+Next, to configure your dstack project, run:
+
+```console
+$ mkdir -p vllm-dstack
+$ cd vllm-dstack
+$ dstack init
+```
+
+Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
+
+```yaml
+type: service
+
+python: "3.11"
+env:
+  - MODEL=NousResearch/Llama-2-7b-chat-hf
+port: 8000
+resources:
+  gpu: 24GB
+commands:
+  - pip install vllm
+  - vllm serve $MODEL --port 8000
+model:
+  format: openai
+  type: chat
+  name: NousResearch/Llama-2-7b-chat-hf
+```
+
+Then, run the following CLI command for provisioning:
+
+```console
+$ dstack run . -f serve.dstack.yml
+
+⠸ Getting run plan...
+ Configuration serve.dstack.yml
+ Project deep-diver-main
+ User deep-diver
+ Min resources 2..xCPU, 8GB.., 1xGPU (24GB)
+ Max price -
+ Max duration -
+ Spot policy auto
+ Retry policy no
+
+ # BACKEND REGION INSTANCE RESOURCES SPOT PRICE
+ 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
+ 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
+ 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
+ ...
+ Shown 3 of 193 offers, $5.876 max
+
+Continue? [y/n]: y
+⠙ Submitting run...
+⠏ Launching spicy-treefrog-1 (pulling)
+spicy-treefrog-1 provisioning completed (running)
+Service is published at ...
+```
+
+After the provisioning, you can interact with the model by using the OpenAI SDK:
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="https://gateway.",
+    api_key=""
+)
+
+completion = client.chat.completions.create(
+    model="NousResearch/Llama-2-7b-chat-hf",
+    messages=[
+        {
+            "role": "user",
+            "content": "Compose a poem that explains the concept of recursion in programming.",
+        }
+    ]
+)
+
+print(completion.choices[0].message.content)
+```
+
+```{note}
+dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on serving vLLM with dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
+```
diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst
deleted file mode 100644
index e1eb45b225d9c..0000000000000
--- a/docs/source/serving/deploying_with_dstack.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-.. _deploying_with_dstack:
-
-Deploying with dstack
-============================
-
-.. raw:: html
-

- vLLM_plus_dstack -

- -vLLM can be run on a cloud based GPU machine with `dstack `__, an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. - -To install dstack client, run: - -.. code-block:: console - - $ pip install "dstack[all] - $ dstack server - -Next, to configure your dstack project, run: - -.. code-block:: console - - $ mkdir -p vllm-dstack - $ cd vllm-dstack - $ dstack init - -Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: - -.. code-block:: yaml - - type: service - - python: "3.11" - env: - - MODEL=NousResearch/Llama-2-7b-chat-hf - port: 8000 - resources: - gpu: 24GB - commands: - - pip install vllm - - vllm serve $MODEL --port 8000 - model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf - -Then, run the following CLI for provisioning: - -.. code-block:: console - - $ dstack run . -f serve.dstack.yml - - ⠸ Getting run plan... - Configuration serve.dstack.yml - Project deep-diver-main - User deep-diver - Min resources 2..xCPU, 8GB.., 1xGPU (24GB) - Max price - - Max duration - - Spot policy auto - Retry policy no - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - ... - Shown 3 of 193 offers, $5.876 max - - Continue? [y/n]: y - ⠙ Submitting run... - ⠏ Launching spicy-treefrog-1 (pulling) - spicy-treefrog-1 provisioning completed (running) - Service is published at ... - -After the provisioning, you can interact with the model by using the OpenAI SDK: - -.. code-block:: python - - from openai import OpenAI - - client = OpenAI( - base_url="https://gateway.", - api_key="" - ) - - completion = client.chat.completions.create( - model="NousResearch/Llama-2-7b-chat-hf", - messages=[ - { - "role": "user", - "content": "Compose a poem that explains the concept of recursion in programming.", - } - ] - ) - - print(completion.choices[0].message.content) - -.. note:: - - dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out `this repository `__ diff --git a/docs/source/serving/deploying_with_helm.rst b/docs/source/serving/deploying_with_helm.md similarity index 88% rename from docs/source/serving/deploying_with_helm.rst rename to docs/source/serving/deploying_with_helm.md index d185a6951d7ec..3b26575827011 100644 --- a/docs/source/serving/deploying_with_helm.rst +++ b/docs/source/serving/deploying_with_helm.md @@ -1,7 +1,6 @@ -.. _deploying_with_helm: +(deploying-with-helm)= -Deploying with Helm -=================== +# Deploying with Helm A Helm chart to deploy vLLM for Kubernetes @@ -9,44 +8,42 @@ Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file. 
-Prerequisites -------------- +## Prerequisites + Before you begin, ensure that you have the following: - A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (``k8s-device-plugin``): This can be found at `https://github.com/NVIDIA/k8s-device-plugin `__ +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) - Available GPU resources in your cluster - S3 with the model which will be deployed -Installing the chart --------------------- - -To install the chart with the release name ``test-vllm``: - -.. code-block:: console +## Installing the chart - helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +To install the chart with the release name `test-vllm`: -Uninstalling the Chart ----------------------- +```console +helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +``` -To uninstall the ``test-vllm`` deployment: +## Uninstalling the Chart -.. code-block:: console +To uninstall the `test-vllm` deployment: - helm uninstall test-vllm --namespace=ns-vllm +```console +helm uninstall test-vllm --namespace=ns-vllm +``` The command removes all the Kubernetes components associated with the chart **including persistent volumes** and deletes the release. -Architecture ------------- +## Architecture -.. image:: architecture_helm_deployment.png +```{image} architecture_helm_deployment.png +``` -Values ------- +## Values +```{eval-rst} .. list-table:: Values :widths: 25 25 25 25 :header-rows: 1 @@ -251,3 +248,4 @@ Values - string - test - Release name +``` diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md new file mode 100644 index 0000000000000..d27db826cd006 --- /dev/null +++ b/docs/source/serving/deploying_with_k8s.md @@ -0,0 +1,171 @@ +(deploying-with-k8s)= + +# Deploying with Kubernetes + +Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. + +## Prerequisites + +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` +- Available GPU resources in your cluster + +## Deployment Steps + +1. 
**Create a PVC , Secret and Deployment for vLLM** + +PVC is used to store the model cache and it is optional, you can use hostPath or other storage options + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: mistral-7b + namespace: default +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: default + volumeMode: Filesystem +``` + +Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: hf-token-secret + namespace: default +type: Opaque +data: + token: "REPLACE_WITH_TOKEN" +``` + +Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b +spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 +``` + +2. **Create a Kubernetes Service for vLLM** + +Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: mistral-7b + namespace: default +spec: + ports: + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP +``` + +3. **Deploy and Test** + +Apply the deployment and service configurations using `kubectl apply -f `: + +```console +kubectl apply -f deployment.yaml +kubectl apply -f service.yaml +``` + +To test the deployment, run the following `curl` command: + +```console +curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' +``` + +If the service is correctly deployed, you should receive a response from the vLLM model. + +## Conclusion + +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. 
If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/deploying_with_k8s.rst b/docs/source/serving/deploying_with_k8s.rst deleted file mode 100644 index cc3606f0df851..0000000000000 --- a/docs/source/serving/deploying_with_k8s.rst +++ /dev/null @@ -1,175 +0,0 @@ -.. _deploying_with_k8s: - -Deploying with Kubernetes -========================== - -Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. - -Prerequisites -------------- -Before you begin, ensure that you have the following: - -- A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` -- Available GPU resources in your cluster - -Deployment Steps ----------------- - -1. **Create a PVC , Secret and Deployment for vLLM** - - -PVC is used to store the model cache and it is optional, you can use hostPath or other storage options - -.. code-block:: yaml - - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: mistral-7b - namespace: default - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - storageClassName: default - volumeMode: Filesystem - -Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models - -.. code-block:: yaml - - apiVersion: v1 - kind: Secret - metadata: - name: hf-token-secret - namespace: default - type: Opaque - data: - token: "REPLACE_WITH_TOKEN" - - -Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: - -.. code-block:: yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b - spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. - - name: shm - emptyDir: - medium: Memory - sizeLimit: "2Gi" - containers: - - name: mistral-7b - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: 6G - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /root/.cache/huggingface - name: cache-volume - - name: shm - mountPath: /dev/shm - livenessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 5 - -2. **Create a Kubernetes Service for vLLM** - -Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: - -.. 
code-block:: yaml - - apiVersion: v1 - kind: Service - metadata: - name: mistral-7b - namespace: default - spec: - ports: - - name: http-mistral-7b - port: 80 - protocol: TCP - targetPort: 8000 - # The label selector should match the deployment labels & it is useful for prefix caching feature - selector: - app: mistral-7b - sessionAffinity: None - type: ClusterIP - -3. **Deploy and Test** - -Apply the deployment and service configurations using ``kubectl apply -f ``: - -.. code-block:: console - - kubectl apply -f deployment.yaml - kubectl apply -f service.yaml - -To test the deployment, run the following ``curl`` command: - -.. code-block:: console - - curl http://mistral-7b.default.svc.cluster.local/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "mistralai/Mistral-7B-Instruct-v0.3", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' - -If the service is correctly deployed, you should receive a response from the vLLM model. - -Conclusion ----------- -Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/serving/deploying_with_kserve.md new file mode 100644 index 0000000000000..feaeb5d0ec8a2 --- /dev/null +++ b/docs/source/serving/deploying_with_kserve.md @@ -0,0 +1,7 @@ +(deploying-with-kserve)= + +# Deploying with KServe + +vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. + +Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe. diff --git a/docs/source/serving/deploying_with_kserve.rst b/docs/source/serving/deploying_with_kserve.rst deleted file mode 100644 index 01d7ccc6e9300..0000000000000 --- a/docs/source/serving/deploying_with_kserve.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _deploying_with_kserve: - -Deploying with KServe -============================ - -vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. - -Please see `this guide `_ for more details on using vLLM with KServe. diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/serving/deploying_with_kubeai.md new file mode 100644 index 0000000000000..3609d7e05acd3 --- /dev/null +++ b/docs/source/serving/deploying_with_kubeai.md @@ -0,0 +1,15 @@ +(deploying-with-kubeai)= + +# Deploying with KubeAI + +[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. + +Please see the Installation Guides for environment specific instructions: + +- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/) +- [EKS](https://www.kubeai.org/installation/eks/) +- [GKE](https://www.kubeai.org/installation/gke/) + +Once you have KubeAI installed, you can +[configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/) +using vLLM. 
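+
+As an illustration only, a vLLM-backed text generation model is typically declared as a Kubernetes resource along the lines of the sketch below. The API group, kind, and field names here are assumptions made for illustration rather than a verified schema, so follow the linked KubeAI guide for the authoritative format:
+
+```yaml
+# Hypothetical sketch of a KubeAI model definition; the field names are assumptions.
+# See the KubeAI documentation linked above for the real schema.
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: mistral-7b-instruct
+spec:
+  features: [TextGeneration]
+  engine: VLLM
+  url: hf://mistralai/Mistral-7B-Instruct-v0.3
+  resourceProfile: nvidia-gpu-l4:1
+```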
diff --git a/docs/source/serving/deploying_with_kubeai.rst b/docs/source/serving/deploying_with_kubeai.rst deleted file mode 100644 index ec3c065320fd9..0000000000000 --- a/docs/source/serving/deploying_with_kubeai.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _deploying_with_kubeai: - -Deploying with KubeAI -===================== - -`KubeAI `_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. - - -Please see the Installation Guides for environment specific instructions: - -* `Any Kubernetes Cluster `_ -* `EKS `_ -* `GKE `_ - -Once you have KubeAI installed, you can -`configure text generation models `_ -using vLLM. \ No newline at end of file diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/serving/deploying_with_lws.md new file mode 100644 index 0000000000000..22bab419eaca3 --- /dev/null +++ b/docs/source/serving/deploying_with_lws.md @@ -0,0 +1,11 @@ +(deploying-with-lws)= + +# Deploying with LWS + +LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. +A major use case is for multi-host/multi-node distributed inference. + +vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving. + +Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on +deploying vLLM on Kubernetes using LWS. diff --git a/docs/source/serving/deploying_with_lws.rst b/docs/source/serving/deploying_with_lws.rst deleted file mode 100644 index b63a432dde0d5..0000000000000 --- a/docs/source/serving/deploying_with_lws.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _deploying_with_lws: - -Deploying with LWS -============================ - -LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. -A major use case is for multi-host/multi-node distributed inference. - -vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. - -Please see `this guide `_ for more details on -deploying vLLM on Kubernetes using LWS. diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/serving/deploying_with_nginx.md new file mode 100644 index 0000000000000..a1f00d8536465 --- /dev/null +++ b/docs/source/serving/deploying_with_nginx.md @@ -0,0 +1,133 @@ +(nginxloadbalancer)= + +# Deploying with Nginx Loadbalancer + +This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. + +Table of contents: + +1. [Build Nginx Container](#nginxloadbalancer-nginx-build) +2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf) +3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container) +4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network) +5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container) +6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx) +7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx) + +(nginxloadbalancer-nginx-build)= + +## Build Nginx Container + +This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. 
+ +```console +export vllm_root=`pwd` +``` + +Create a file named `Dockerfile.nginx`: + +```console +FROM nginx:latest +RUN rm /etc/nginx/conf.d/default.conf +EXPOSE 80 +CMD ["nginx", "-g", "daemon off;"] +``` + +Build the container: + +```console +docker build . -f Dockerfile.nginx --tag nginx-lb +``` + +(nginxloadbalancer-nginx-conf)= + +## Create Simple Nginx Config file + +Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. + +```console +upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; +} +server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } +} +``` + +(nginxloadbalancer-nginx-vllm-container)= + +## Build vLLM Container + +```console +cd $vllm_root +docker build -f Dockerfile . --tag vllm +``` + +If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: + +```console +cd $vllm_root +docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy +``` + +(nginxloadbalancer-nginx-docker-network)= + +## Create Docker Network + +```console +docker network create vllm_nginx +``` + +(nginxloadbalancer-nginx-launch-container)= + +## Launch vLLM Containers + +Notes: + +- If you have your HuggingFace models cached somewhere else, update `hf_cache_dir` below. +- If you don't have an existing HuggingFace cache you will want to start `vllm0` and wait for the model to complete downloading and the server to be ready. This will ensure that `vllm1` can leverage the model you just downloaded and it won't have to be downloaded again. +- The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus all`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. +- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. + +```console +mkdir -p ~/.cache/huggingface/hub/ +hf_cache_dir=~/.cache/huggingface/ +docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf +docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf +``` + +```{note} +If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. 
+``` + +(nginxloadbalancer-nginx-launch-nginx)= + +## Launch Nginx + +```console +docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest +``` + +(nginxloadbalancer-nginx-verify-nginx)= + +## Verify That vLLM Servers Are Ready + +```console +docker logs vllm0 | grep Uvicorn +docker logs vllm1 | grep Uvicorn +``` + +Both outputs should look like this: + +```console +INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) +``` diff --git a/docs/source/serving/deploying_with_nginx.rst b/docs/source/serving/deploying_with_nginx.rst deleted file mode 100644 index b5dff02b6bae6..0000000000000 --- a/docs/source/serving/deploying_with_nginx.rst +++ /dev/null @@ -1,142 +0,0 @@ -.. _nginxloadbalancer: - -Deploying with Nginx Loadbalancer -================================= - -This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. - -Table of contents: - -#. :ref:`Build Nginx Container ` -#. :ref:`Create Simple Nginx Config file ` -#. :ref:`Build vLLM Container ` -#. :ref:`Create Docker Network ` -#. :ref:`Launch vLLM Containers ` -#. :ref:`Launch Nginx ` -#. :ref:`Verify That vLLM Servers Are Ready ` - -.. _nginxloadbalancer_nginx_build: - -Build Nginx Container ---------------------- - -This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. - -.. code-block:: console - - export vllm_root=`pwd` - -Create a file named ``Dockerfile.nginx``: - -.. code-block:: console - - FROM nginx:latest - RUN rm /etc/nginx/conf.d/default.conf - EXPOSE 80 - CMD ["nginx", "-g", "daemon off;"] - -Build the container: - -.. code-block:: console - - docker build . -f Dockerfile.nginx --tag nginx-lb - -.. _nginxloadbalancer_nginx_conf: - -Create Simple Nginx Config file -------------------------------- - -Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``. - -.. code-block:: console - - upstream backend { - least_conn; - server vllm0:8000 max_fails=3 fail_timeout=10000s; - server vllm1:8000 max_fails=3 fail_timeout=10000s; - } - server { - listen 80; - location / { - proxy_pass http://backend; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } - } - -.. _nginxloadbalancer_nginx_vllm_container: - -Build vLLM Container --------------------- - -.. code-block:: console - - cd $vllm_root - docker build -f Dockerfile . --tag vllm - - -If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: - -.. code-block:: console - - cd $vllm_root - docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy - -.. _nginxloadbalancer_nginx_docker_network: - -Create Docker Network ---------------------- - -.. code-block:: console - - docker network create vllm_nginx - - -.. _nginxloadbalancer_nginx_launch_container: - -Launch vLLM Containers ----------------------- - -Notes: - -* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. 
-* If you don't have an existing HuggingFace cache you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again. -* The below example assumes GPU backend used. If you are using CPU backend, remove ``--gpus all``, add ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command. -* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. - -.. code-block:: console - - mkdir -p ~/.cache/huggingface/hub/ - hf_cache_dir=~/.cache/huggingface/ - docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf - docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf - -.. note:: - If you are behind proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``. - -.. _nginxloadbalancer_nginx_launch_nginx: - -Launch Nginx ------------- - -.. code-block:: console - - docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest - -.. _nginxloadbalancer_nginx_verify_nginx: - -Verify That vLLM Servers Are Ready ----------------------------------- - -.. code-block:: console - - docker logs vllm0 | grep Uvicorn - docker logs vllm1 | grep Uvicorn - -Both outputs should look like this: - -.. code-block:: console - - INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/serving/deploying_with_triton.md new file mode 100644 index 0000000000000..9b0a6f1d54ae8 --- /dev/null +++ b/docs/source/serving/deploying_with_triton.md @@ -0,0 +1,5 @@ +(deploying-with-triton)= + +# Deploying with NVIDIA Triton + +The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. diff --git a/docs/source/serving/deploying_with_triton.rst b/docs/source/serving/deploying_with_triton.rst deleted file mode 100644 index 5ce7c3d03dd2d..0000000000000 --- a/docs/source/serving/deploying_with_triton.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. _deploying_with_triton: - -Deploying with NVIDIA Triton -============================ - -The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md new file mode 100644 index 0000000000000..c0a4b23f6dc70 --- /dev/null +++ b/docs/source/serving/distributed_serving.md @@ -0,0 +1,105 @@ +(distributed-serving)= + +# Distributed Inference and Serving + +## How to decide the distributed inference strategy? 
+ +Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is: + +- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. +- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. +- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. + +In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. + +After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. + +```{note} +There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. +``` + +## Details for Distributed Inference and Serving + +vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. + +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed-executor-backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. + +To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. 
For example, to run inference on 4 GPUs: + +```python +from vllm import LLM +llm = LLM("facebook/opt-13b", tensor_parallel_size=4) +output = llm.generate("San Francisco is a") +``` + +To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run the API server on 4 GPUs: + +```console +$ vllm serve facebook/opt-13b \ +$ --tensor-parallel-size 4 +``` + +You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run the API server on 8 GPUs with pipeline parallelism and tensor parallelism: + +```console +$ vllm serve gpt2 \ +$ --tensor-parallel-size 4 \ +$ --pipeline-parallel-size 2 +``` + +## Multi-Node Inference and Serving + +If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use docker images to ensure the same environment, and to hide the heterogeneity of the host machines by mapping them into the same docker configuration. + +The first step is to start containers and organize them into a cluster. We have provided a helper script to start the cluster. Please note that this script launches docker without the administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can add `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. + +Pick a node as the head node, and run the following command: + +```console +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --head \ +$ /path/to/the/huggingface/home/in/this/node +``` + +On the rest of the worker nodes, run the following command: + +```console +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --worker \ +$ /path/to/the/huggingface/home/in/this/node +``` + +You then get a Ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of a worker node, which is not correct. + +Then, on any node, use `docker exec -it node /bin/bash` to enter the container and execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. + +After that, on any node, you can use vLLM as usual, as if you had all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: + +```console +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 8 \ +$ --pipeline-parallel-size 2 +``` + +You can also use tensor parallel without pipeline parallel; just set the tensor parallel size to the number of GPUs in the cluster.
For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: + +```console +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 16 +``` + +To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. + +```{warning} +After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. +``` + +```{warning} +Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. + +When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. +``` diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst deleted file mode 100644 index b24ba53e59694..0000000000000 --- a/docs/source/serving/distributed_serving.rst +++ /dev/null @@ -1,107 +0,0 @@ -.. _distributed_serving: - -Distributed Inference and Serving -================================= - -How to decide the distributed inference strategy? -------------------------------------------------- - -Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is: - -- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. -- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. 
-- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. - -In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. - -After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like ``# GPU blocks: 790``. Multiply the number by ``16`` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. - -.. note:: - There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. - -Details for Distributed Inference and Serving ----------------------------------------------- - -vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. - -Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. - -To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: - -.. code-block:: python - - from vllm import LLM - llm = LLM("facebook/opt-13b", tensor_parallel_size=4) - output = llm.generate("San Franciso is a") - -To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: - -.. code-block:: console - - $ vllm serve facebook/opt-13b \ - $ --tensor-parallel-size 4 - -You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: - -.. 
code-block:: console - - $ vllm serve gpt2 \ - $ --tensor-parallel-size 4 \ - $ --pipeline-parallel-size 2 - -Multi-Node Inference and Serving --------------------------------- - -If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. - -The first step, is to start containers and organize them into a cluster. We have provided a helper `script `_ to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have ``CAP_SYS_ADMIN`` to the docker container by using the ``--cap-add`` option in the docker run command. - -Pick a node as the head node, and run the following command: - -.. code-block:: console - - $ bash run_cluster.sh \ - $ vllm/vllm-openai \ - $ ip_of_head_node \ - $ --head \ - $ /path/to/the/huggingface/home/in/this/node - -On the rest of the worker nodes, run the following command: - -.. code-block:: console - - $ bash run_cluster.sh \ - $ vllm/vllm-openai \ - $ ip_of_head_node \ - $ --worker \ - $ /path/to/the/huggingface/home/in/this/node - -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. - -Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. - -After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: - -.. code-block:: console - - $ vllm serve /path/to/the/model/in/the/container \ - $ --tensor-parallel-size 8 \ - $ --pipeline-parallel-size 2 - -You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: - -.. code-block:: console - - $ vllm serve /path/to/the/model/in/the/container \ - $ --tensor-parallel-size 16 - -To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. 
``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. - -.. warning:: - After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. - -.. warning:: - - Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. - - When you use huggingface repo id to refer to the model, you should append your huggingface token to the ``run_cluster.sh`` script, e.g. ``-e HF_TOKEN=``. The recommended way is to download the model first, and then use the path to refer to the model. diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md new file mode 100644 index 0000000000000..d214c77254257 --- /dev/null +++ b/docs/source/serving/integrations.md @@ -0,0 +1,17 @@ +# Integrations + +```{toctree} +:maxdepth: 1 + +run_on_sky +deploying_with_kserve +deploying_with_kubeai +deploying_with_triton +deploying_with_bentoml +deploying_with_cerebrium +deploying_with_lws +deploying_with_dstack +serving_with_langchain +serving_with_llamaindex +serving_with_llamastack +``` diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst deleted file mode 100644 index 0dd505a739863..0000000000000 --- a/docs/source/serving/integrations.rst +++ /dev/null @@ -1,17 +0,0 @@ -Integrations ------------- - -.. toctree:: - :maxdepth: 1 - - run_on_sky - deploying_with_kserve - deploying_with_kubeai - deploying_with_triton - deploying_with_bentoml - deploying_with_cerebrium - deploying_with_lws - deploying_with_dstack - serving_with_langchain - serving_with_llamaindex - serving_with_llamastack diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md new file mode 100644 index 0000000000000..2dc78643f6d8f --- /dev/null +++ b/docs/source/serving/metrics.md @@ -0,0 +1,38 @@ +# Production Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the +system. These metrics are exposed via the `/metrics` endpoint on the vLLM +OpenAI compatible API server. + +You can start the server using Python, or using [Docker](deploying_with_docker.md): + +```console +$ vllm serve unsloth/Llama-3.2-1B-Instruct +``` + +Then query the endpoint to get the latest metrics from the server: + +```console +$ curl http://0.0.0.0:8000/metrics + +# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. 
+# TYPE vllm:iteration_tokens_total histogram +vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 +vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +... +``` + +The following metrics are exposed: + +```{literalinclude} ../../../vllm/engine/metrics.py +:end-before: end-metrics-definitions +:language: python +:start-after: begin-metrics-definitions +``` diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst deleted file mode 100644 index 231111cd7b738..0000000000000 --- a/docs/source/serving/metrics.rst +++ /dev/null @@ -1,38 +0,0 @@ -Production Metrics -================== - -vLLM exposes a number of metrics that can be used to monitor the health of the -system. These metrics are exposed via the ``/metrics`` endpoint on the vLLM -OpenAI compatible API server. - -You can start the server using Python, or using [Docker](deploying_with_docker.rst): - -.. code-block:: console - - $ vllm serve unsloth/Llama-3.2-1B-Instruct - -Then query the endpoint to get the latest metrics from the server: - -.. code-block:: console - - $ curl http://0.0.0.0:8000/metrics - - # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. - # TYPE vllm:iteration_tokens_total histogram - vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 - vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - ... - -The following metrics are exposed: - -.. literalinclude:: ../../../vllm/engine/metrics.py - :language: python - :start-after: begin-metrics-definitions - :end-before: end-metrics-definitions diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1bc8d32d2d161..23c66f72162d2 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -2,7 +2,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more! 
-You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.rst): +You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` @@ -30,20 +30,22 @@ print(completion.choices[0].message) We currently support the following OpenAI APIs: - [Completions API](#completions-api) (`/v1/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`). + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). - *Note: `suffix` parameter is not supported.* - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template). - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). + - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). In addition, we have the following custom APIs: - [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - Applicable to any model with a tokenizer. +- [Pooling API](#pooling-api) (`/pooling`) + - Applicable to all [pooling models](../models/pooling_models.md). - [Score API](#score-api) (`/score`) - - Only applicable to [cross-encoder models](../models/pooling_models.rst) (`--task score`). + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= ## Chat Template @@ -63,8 +65,7 @@ and all chat requests will error. vllm serve --chat-template ./path-to-chat-template.jinja ``` -vLLM community provides a set of chat templates for popular models. You can find them in the examples -directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) +vLLM community provides a set of chat templates for popular models. You can find them under the directory. With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: @@ -179,11 +180,14 @@ The order of priorities is `command line > config file values > defaults`. (completions-api)= ### Completions API -Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/completions) for more details. +Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); +you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. + +Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. 
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -200,18 +204,21 @@ The following extra parameters are supported: ``` (chat-api)= -### Chat Completions API +### Chat API -Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details. +Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat); +you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information. +see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. - *Note: `image_url.detail` parameter is not supported.* +Code example: + #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -230,18 +237,21 @@ The following extra parameters are supported: (embeddings-api)= ### Embeddings API -Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/embeddings) for more details. +Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); +you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat Completions API](#chat-api)) +If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) which will be treated as a single prompt to the model. ```{tip} -This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. +This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` +Code example: + #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -268,20 +278,31 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s (tokenizer-api)= ### Tokenizer API -The Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). +Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). It consists of two endpoints: - `/tokenize` corresponds to calling `tokenizer.encode()`. - `/detokenize` corresponds to calling `tokenizer.decode()`. +(pooling-api)= +### Pooling API + +Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. 
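As a rough illustration, a request to this endpoint can be sketched as follows; the model name is only a placeholder, and the request body mirrors the Embeddings API schema described in the next paragraph:

```console
# Hedged sketch: assumes a vLLM server is already running a pooling model
# (the model name below is a placeholder, not a recommendation).
curl -X POST http://localhost:8000/pooling \
  -H "Content-Type: application/json" \
  -d '{
    "model": "BAAI/bge-base-en-v1.5",
    "input": "vLLM is a fast and easy-to-use library for LLM inference."
  }'
```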
+ +The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. + +Code example: + (score-api)= ### Score API -The Score API applies a cross-encoder model to predict scores for sentence pairs. +Our Score API applies a cross-encoder model to predict scores for sentence pairs. Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1. You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). +Code example: + #### Single inference You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. @@ -418,7 +439,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/docs/source/serving/run_on_sky.md b/docs/source/serving/run_on_sky.md new file mode 100644 index 0000000000000..115873ae49292 --- /dev/null +++ b/docs/source/serving/run_on_sky.md @@ -0,0 +1,345 @@ +(on-cloud)= + +# Deploying and scaling up with SkyPilot + +```{raw} html +

+ vLLM +

+``` + +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). + +## Prerequisites + +- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`. +- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that {code}`sky check` shows clouds or Kubernetes are enabled. + +```console +pip install skypilot-nightly +sky check +``` + +## Run on a single instance + +See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). + +```yaml +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 +``` + +Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): + +```console +HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN +``` + +Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. + +```console +(task, pid=7431) Running on public URL: https://.gradio.live +``` + +**Optional**: Serve the 70B model instead of the default 8B and use more GPU: + +```console +HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct +``` + +## Scale up to multiple replicas + +SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. + +```yaml +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 +``` + +```{raw} html +
+Click to see the full recipe YAML +``` + +```yaml +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log +``` + +```{raw} html +
+``` + +Start serving the Llama-3 8B model on multiple replicas: + +```console +HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN +``` + +Wait until the service is ready: + +```console +watch -n10 sky serve status vllm +``` + +```{raw} html +
+Example outputs: +``` + +```console +Services +NAME VERSION UPTIME STATUS REPLICAS ENDPOINT +vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + +Service Replicas +SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION +vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 +vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 +``` + +```{raw} html +
+``` + +After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: + +```console +ENDPOINT=$(sky serve status --endpoint 8081 vllm) +curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' +``` + +To enable autoscaling, you could replace the `replicas` field with the following config in the `service` section: + +```yaml +service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 +``` + +This will scale the service up when the QPS exceeds 2 for each replica. + +```{raw} html +
+Click to see the full recipe YAML +``` + +```yaml +service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log +``` + +```{raw} html +
+``` + +To update the service with the new config: + +```console +HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN +``` + +To stop the service: + +```console +sky serve down vllm +``` + +### **Optional**: Connect a GUI to the endpoint + +It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas. + +```{raw} html +
+Click to see the full GUI YAML +``` + +```yaml +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + +resources: + cpus: 2 + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + +run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log +``` + +```{raw} html +
+``` + +1. Start the chat web UI: + +```console +sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) +``` + +2. Then, we can access the GUI at the returned gradio link: + +```console +| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live +``` diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst deleted file mode 100644 index 227e6fd2a7818..0000000000000 --- a/docs/source/serving/run_on_sky.rst +++ /dev/null @@ -1,366 +0,0 @@ -.. _on_cloud: - -Deploying and scaling up with SkyPilot -================================================ - -.. raw:: html - -

- vLLM -

- -vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. - - -Prerequisites -------------- - -- Go to the `HuggingFace model page `__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`. -- Check that you have installed SkyPilot (`docs `__). -- Check that :code:`sky check` shows clouds or Kubernetes are enabled. - -.. code-block:: console - - pip install skypilot-nightly - sky check - - -Run on a single instance ------------------------- - -See the vLLM SkyPilot YAML for serving, `serving.yaml `__. - -.. code-block:: yaml - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 - -Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN - -Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. - -.. code-block:: console - - (task, pid=7431) Running on public URL: https://.gradio.live - -**Optional**: Serve the 70B model instead of the default 8B and use more GPU: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct - - -Scale up to multiple replicas ------------------------------ - -SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. - -.. code-block:: yaml - - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - -.. raw:: html - -
- Click to see the full recipe YAML - - -.. code-block:: yaml - - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log - -.. raw:: html - -
- -Start the serving the Llama-3 8B model on multiple replicas: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN - - -Wait until the service is ready: - -.. code-block:: console - - watch -n10 sky serve status vllm - - -.. raw:: html - -
- Example outputs: - -.. code-block:: console - - Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - vllm 1 35s READY 2/2 xx.yy.zz.100:30001 - - Service Replicas - SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION - vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 - vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 - -.. raw:: html - -
- -After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: - -.. code-block:: console - - ENDPOINT=$(sky serve status --endpoint 8081 vllm) - curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' - -To enable autoscaling, you could replace the `replicas` with the following configs in `service`: - -.. code-block:: yaml - - service: - replica_policy: - min_replicas: 2 - max_replicas: 4 - target_qps_per_replica: 2 - -This will scale the service up to when the QPS exceeds 2 for each replica. - - -.. raw:: html - -
- Click to see the full recipe YAML - - -.. code-block:: yaml - - service: - replica_policy: - min_replicas: 2 - max_replicas: 4 - target_qps_per_replica: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log - - -.. raw:: html - -
- -To update the service with the new config: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN - - -To stop the service: - -.. code-block:: console - - sky serve down vllm - - -**Optional**: Connect a GUI to the endpoint -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - -It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. - -.. raw:: html - -
- Click to see the full GUI YAML - -.. code-block:: yaml - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - - resources: - cpus: 2 - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - # Install Gradio for web UI. - pip install gradio openai - - run: | - conda activate vllm - export PATH=$PATH:/sbin - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log - - -.. raw:: html - -
- -1. Start the chat web UI: - -.. code-block:: console - - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) - - -2. Then, we can access the GUI at the returned gradio link: - -.. code-block:: console - - | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live - - diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md new file mode 100644 index 0000000000000..1b5756a95075a --- /dev/null +++ b/docs/source/serving/runai_model_streamer.md @@ -0,0 +1,53 @@ +(runai-model-streamer)= + +# Loading Models with Run:ai Model Streamer + +Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. +Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md). + +vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer. +You first need to install vLLM RunAI optional dependency: + +```console +$ pip3 install vllm[runai] +``` + +To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: + +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +``` + +To run model from AWS S3 object store run: + +```console +$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +``` + +To run model from a S3 compatible object store run: + +```console +$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +``` + +## Tunable parameters + +You can tune parameters using `--model-loader-extra-config`: + +You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. +For reading from S3, it will be the number of client instances the host is opening to the S3 server. + +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +``` + +You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. +You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). + +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +``` + +```{note} +For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). +``` diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/serving_with_langchain.md new file mode 100644 index 0000000000000..96bd5943f3d64 --- /dev/null +++ b/docs/source/serving/serving_with_langchain.md @@ -0,0 +1,30 @@ +(run-on-langchain)= + +# Serving with Langchain + +vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain) . + +To install langchain, run + +```console +$ pip install langchain langchain_community -q +``` + +To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. 
+ +```python +from langchain_community.llms import VLLM + +llm = VLLM(model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # tensor_parallel_size=... # for distributed inference +) + +print(llm("What is the capital of France ?")) +``` + +Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. diff --git a/docs/source/serving/serving_with_langchain.rst b/docs/source/serving/serving_with_langchain.rst deleted file mode 100644 index 6440c8aad5986..0000000000000 --- a/docs/source/serving/serving_with_langchain.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _run_on_langchain: - -Serving with Langchain -============================ - -vLLM is also available via `Langchain `_ . - -To install langchain, run - -.. code-block:: console - - $ pip install langchain langchain_community -q - -To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. - -.. code-block:: python - - from langchain_community.llms import VLLM - - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... # for distributed inference - ) - - print(llm("What is the capital of France ?")) - -Please refer to this `Tutorial `_ for more details. diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/serving_with_llamaindex.md new file mode 100644 index 0000000000000..98859d8e3f828 --- /dev/null +++ b/docs/source/serving/serving_with_llamaindex.md @@ -0,0 +1,26 @@ +(run-on-llamaindex)= + +# Serving with llama_index + +vLLM is also available via [llama_index](https://github.com/run-llama/llama_index) . + +To install llamaindex, run + +```console +$ pip install llama-index-llms-vllm -q +``` + +To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. + +```python +from llama_index.llms.vllm import Vllm + +llm = Vllm( + model="microsoft/Orca-2-7b", + tensor_parallel_size=4, + max_new_tokens=100, + vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, +) +``` + +Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. diff --git a/docs/source/serving/serving_with_llamaindex.rst b/docs/source/serving/serving_with_llamaindex.rst deleted file mode 100644 index 038e961344e47..0000000000000 --- a/docs/source/serving/serving_with_llamaindex.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. _run_on_llamaindex: - -Serving with llama_index -============================ - -vLLM is also available via `llama_index `_ . - -To install llamaindex, run - -.. code-block:: console - - $ pip install llama-index-llms-vllm -q - -To run inference on a single or multiple GPUs, use ``Vllm`` class from ``llamaindex``. - -.. code-block:: python - - from llama_index.llms.vllm import Vllm - - llm = Vllm( - model="microsoft/Orca-2-7b", - tensor_parallel_size=4, - max_new_tokens=100, - vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, - ) - -Please refer to this `Tutorial `_ for more details. 
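Building on the llama_index snippet above, a minimal usage sketch could look like the following; it assumes the standard llama_index `complete()` interface, and the prompt text is arbitrary:

```python
# Minimal sketch: reuse the `llm` object created with llama_index's Vllm class above.
# Assumes the standard llama_index LLM interface, where complete() returns a
# response object exposing the generated text via `.text`.
response = llm.complete("What is the capital of France?")
print(response.text)
```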
diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/serving/serving_with_llamastack.md new file mode 100644 index 0000000000000..71dadca7ad47c --- /dev/null +++ b/docs/source/serving/serving_with_llamastack.md @@ -0,0 +1,38 @@ +(run-on-llamastack)= + +# Serving with Llama Stack + +vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . + +To install Llama Stack, run + +```console +$ pip install llama-stack -q +``` + +## Inference using OpenAI Compatible API + +Then start Llama Stack server pointing to your vLLM server with the following configuration: + +```yaml +inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 +``` + +Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider. + +## Inference via Embedded vLLM + +An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm) +is also available. This is a sample of configuration using that method: + +```yaml +inference + - provider_type: vllm + config: + model: Llama3.1-8B-Instruct + tensor_parallel_size: 4 +``` diff --git a/docs/source/serving/serving_with_llamastack.rst b/docs/source/serving/serving_with_llamastack.rst deleted file mode 100644 index a2acd7b39f887..0000000000000 --- a/docs/source/serving/serving_with_llamastack.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. _run_on_llamastack: - -Serving with Llama Stack -============================ - -vLLM is also available via `Llama Stack `_ . - -To install Llama Stack, run - -.. code-block:: console - - $ pip install llama-stack -q - -Inference using OpenAI Compatible API -------------------------------------- - -Then start Llama Stack server pointing to your vLLM server with the following configuration: - -.. code-block:: yaml - - inference: - - provider_id: vllm0 - provider_type: remote::vllm - config: - url: http://127.0.0.1:8000 - -Please refer to `this guide `_ for more details on this remote vLLM provider. - -Inference via Embedded vLLM ---------------------------- - -An `inline vLLM provider -`_ -is also available. This is a sample of configuration using that method: - -.. code-block:: yaml - - inference - - provider_type: vllm - config: - model: Llama3.1-8B-Instruct - tensor_parallel_size: 4 diff --git a/docs/source/serving/tensorizer.md b/docs/source/serving/tensorizer.md new file mode 100644 index 0000000000000..d3dd29d48f730 --- /dev/null +++ b/docs/source/serving/tensorizer.md @@ -0,0 +1,16 @@ +(tensorizer)= + +# Loading Models with CoreWeave's Tensorizer + +vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). +vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized +at runtime extremely quickly directly to the GPU, resulting in significantly +shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. + +For more information on CoreWeave's Tensorizer, please refer to +[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). 
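As a rough sketch of how a serialized model might then be served, the command below illustrates the idea; the S3 URI is a placeholder, and the flag and `tensorizer_uri` key are assumptions based on vLLM's Tensorizer integration, so consult the example script above for the authoritative workflow:

```console
# Hedged sketch: serve a model whose weights were previously serialized with Tensorizer.
# The bucket path is illustrative only.
vllm serve facebook/opt-125m \
  --load-format tensorizer \
  --model-loader-extra-config '{"tensorizer_uri": "s3://my-bucket/opt-125m/model.tensors"}'
```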
+ +```{note} +Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. +``` diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst deleted file mode 100644 index 96a93db94871b..0000000000000 --- a/docs/source/serving/tensorizer.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. _tensorizer: - -Loading Models with CoreWeave's Tensorizer -========================================== -vLLM supports loading models with `CoreWeave's Tensorizer `_. -vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized -at runtime extremely quickly directly to the GPU, resulting in significantly -shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. - -For more information on CoreWeave's Tensorizer, please refer to -`CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the `vLLM example script `_. - -.. note:: - Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md new file mode 100644 index 0000000000000..3cefa12ea8a1d --- /dev/null +++ b/docs/source/usage/compatibility_matrix.md @@ -0,0 +1,468 @@ +(compatibility-matrix)= + +# Compatibility Matrix + +The tables below show mutually exclusive features and the support on some hardware. + +```{note} +Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. +``` + +## Feature x Feature + +```{raw} html + +``` + +```{list-table} + :header-rows: 1 + :stub-columns: 1 + :widths: auto + + * - Feature + - [CP](#chunked-prefill) + - [APC](#apc) + - [LoRA](#lora-adapter) + - prmpt adptr + - [SD](#spec_decode) + - CUDA graph + - pooling + - enc-dec + - logP + - prmpt logP + - async output + - multi-step + - mm + - best-of + - beam-search + - guided dec + * - [CP](#chunked-prefill) + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - [APC](#apc) + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - [LoRA](#lora-adapter) + - [✗](gh-pr:9057) + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - prmpt adptr + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + * - [SD](#spec_decode) + - ✅ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + * - pooling + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - + - + - + - + - + - + - + - + - + - + * - enc-dec + - ✗ + - [✗](gh-issue:7366) + - ✗ + - ✗ + - [✗](gh-issue:7366) + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + * - logP + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + * - prmpt logP + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](gh-pr:8199) + - ✅ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + - + * - async output + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - ✗ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + * - multi-step + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✗ + - ✅ + - [✗](gh-issue:8198) + - ✅ + - + - + - + - + - + * - mm + - ✅ + - [✗](gh-pr:8348) + - [✗](gh-pr:7199) + - ? + - ? + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - + - + - + - + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](gh-issue:6137) + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? 
+ - [✗](gh-issue:7968) + - ✅ + - + - + - + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](gh-issue:6137) + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - [✗](gh-issue:7968>) + - ? + - ✅ + - + - + * - guided dec + - ✅ + - ✅ + - ? + - ? + - ✅ + - ✅ + - ✗ + - ? + - ✅ + - ✅ + - ✅ + - [✗](gh-issue:9893) + - ? + - ✅ + - ✅ + - + +``` + +### Feature x Hardware + +```{list-table} + :header-rows: 1 + :stub-columns: 1 + :widths: auto + + * - Feature + - Volta + - Turing + - Ampere + - Ada + - Hopper + - CPU + - AMD + * - [CP](#chunked-prefill) + - [✗](gh-issue:2729) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - [APC](#apc) + - [✗](gh-issue:3687) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - [LoRA](#lora-adapter) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](gh-pr:4830) + - ✅ + * - prmpt adptr + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](gh-issue:8475) + - ✅ + * - [SD](#spec_decode) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - pooling + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + * - enc-dec + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + * - mm + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - logP + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - prmpt logP + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - async output + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✗ + * - multi-step + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](gh-issue:8477) + - ✅ + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - guided dec + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ +``` diff --git a/docs/source/usage/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst deleted file mode 100644 index 04dd72b1e3527..0000000000000 --- a/docs/source/usage/compatibility_matrix.rst +++ /dev/null @@ -1,468 +0,0 @@ -.. _compatibility_matrix: - -Compatibility Matrix -==================== - -The tables below show mutually exclusive features and the support on some hardware. - -.. note:: - - Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. - -Feature x Feature ------------------ - - -.. raw:: html - - - -.. 
list-table:: - :header-rows: 1 - :widths: auto - - * - Feature - - :ref:`CP ` - - :ref:`APC ` - - :ref:`LoRA ` - - :abbr:`prmpt adptr (Prompt Adapter)` - - :ref:`SD ` - - CUDA graph - - :abbr:`pooling (Pooling Models)` - - :abbr:`enc-dec (Encoder-Decoder Models)` - - :abbr:`logP (Logprobs)` - - :abbr:`prmpt logP (Prompt Logprobs)` - - :abbr:`async output (Async Output Processing)` - - multi-step - - :abbr:`mm (Multimodal Inputs)` - - best-of - - beam-search - - :abbr:`guided dec (Guided Decoding)` - * - :ref:`CP ` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :ref:`APC ` - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :ref:`LoRA ` - - `✗ `__ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :abbr:`prmpt adptr (Prompt Adapter)` - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :ref:`SD ` - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - * - :abbr:`pooling (Pooling Models)` - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - - - - - - - - - - - - - - - - - - - - * - :abbr:`enc-dec (Encoder-Decoder Models)` - - ✗ - - `✗ `__ - - ✗ - - ✗ - - `✗ `__ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - * - :abbr:`logP (Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - * - :abbr:`prmpt logP (Prompt Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - - - * - :abbr:`async output (Async Output Processing)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - * - multi-step - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - `✗ `__ - - ✅ - - - - - - - - - - - * - :abbr:`mm (Multimodal Inputs)` - - ✅ - - `✗ `__ - - `✗ `__ - - ? - - ? - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? - - - - - - - - - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - `✗ `__ - - ✅ - - - - - - - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - `✗ `__ - - ? - - ✅ - - - - - * - :abbr:`guided dec (Guided Decoding)` - - ✅ - - ✅ - - ? - - ? - - ✅ - - ✅ - - ✗ - - ? - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ? - - ✅ - - ✅ - - - - -Feature x Hardware -^^^^^^^^^^^^^^^^^^ - -.. list-table:: - :header-rows: 1 - :widths: auto - - * - Feature - - Volta - - Turing - - Ampere - - Ada - - Hopper - - CPU - - AMD - * - :ref:`CP ` - - `✗ `__ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :ref:`APC ` - - `✗ `__ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :ref:`LoRA ` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - * - :abbr:`prmpt adptr (Prompt Adapter)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - * - :ref:`SD ` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - * - :abbr:`pooling (Pooling Models)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? 
- * - :abbr:`enc-dec (Encoder-Decoder Models)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - * - :abbr:`mm (Multimodal Inputs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`logP (Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`prmpt logP (Prompt Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`async output (Async Output Processing)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✗ - * - multi-step - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`guided dec (Guided Decoding)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ diff --git a/docs/source/usage/disagg_prefill.md b/docs/source/usage/disagg_prefill.md new file mode 100644 index 0000000000000..a61c00fad1e3c --- /dev/null +++ b/docs/source/usage/disagg_prefill.md @@ -0,0 +1,64 @@ +(disagg-prefill)= + +# Disaggregated prefilling (experimental) + +This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. + +## Why disaggregated prefilling? + +Two main reasons: + +- **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. +- **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. + +```{note} +Disaggregated prefill DOES NOT improve throughput. +``` + +## Usage example + +Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. + +## Benchmarks + +Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks. + +## Development + +We implement disaggregated prefilling by running 2 vLLM instances. One for prefill (we call it prefill instance) and one for decode (we call it decode instance), and then use a connector to transfer the prefill KV caches and results from prefill instance to decode instance. + +All disaggregated prefilling implementation is under `vllm/distributed/kv_transfer`. + +Key abstractions for disaggregated prefilling: + +- **Connector**: Connector allows **kv consumer** to retrieve the KV caches of a batch of request from **kv producer**. +- **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer. +- **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. + +```{note} +`insert` is non-blocking operation but `drop_select` is blocking operation. 
+``` + +Here is a figure illustrating how the above 3 abstractions are organized: + +```{image} /assets/usage/disagg_prefill/abstraction.jpg +:alt: Disaggregated prefilling abstractions +``` + +The workflow of disaggregated prefilling is as follows: + +```{image} /assets/usage/disagg_prefill/overview.jpg +:alt: Disaggregated prefilling workflow +``` + +The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. + +## Third-party contributions + +Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and vLLM team will actively review and merge new PRs for third-party connectors). + +We recommend three ways of implementations: + +- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. +- **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL. +- **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`. diff --git a/docs/source/usage/disagg_prefill.rst b/docs/source/usage/disagg_prefill.rst deleted file mode 100644 index 9fe714b4fd856..0000000000000 --- a/docs/source/usage/disagg_prefill.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. _disagg_prefill: - -Disaggregated prefilling (experimental) -======================================= - -This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. - -Why disaggregated prefilling? ------------------------------ - -Two main reasons: - -* **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. ``tp`` and ``pp``) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. -* **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. - -.. note:: - Disaggregated prefill DOES NOT improve throughput. - -Usage example -------------- - -Please refer to ``examples/disaggregated_prefill.sh`` for the example usage of disaggregated prefilling. - - -Benchmarks ----------- - -Please refer to ``benchmarks/disagg_benchmarks/`` for disaggregated prefilling benchmarks. - - -Development ------------ - -We implement disaggregated prefilling by running 2 vLLM instances. One for prefill (we call it prefill instance) and one for decode (we call it decode instance), and then use a connector to transfer the prefill KV caches and results from prefill instance to decode instance. - -All disaggregated prefilling implementation is under ``vllm/distributed/kv_transfer``. 
- -Key abstractions for disaggregated prefilling: - -* **Connector**: Connector allows **kv consumer** to retrieve the KV caches of a batch of request from **kv producer**. -* **LookupBuffer**: LookupBuffer provides two API: ``insert`` KV cache and ``drop_select`` KV cache. The semantics of ``insert`` and ``drop_select`` are similar to SQL, where ``insert`` inserts a KV cache into the buffer, and ``drop_select`` returns the KV cache that matches the given condition and drop it from the buffer. -* **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports ``send_tensor`` and ``recv_tensor``. - -.. note:: - ``insert`` is non-blocking operation but ``drop_select`` is blocking operation. - -Here is a figure illustrating how the above 3 abstractions are organized: - -.. image:: /assets/usage/disagg_prefill/abstraction.jpg - :alt: Disaggregated prefilling abstractions - -The workflow of disaggregated prefilling is as follows: - -.. image:: /assets/usage/disagg_prefill/overview.jpg - :alt: Disaggregated prefilling workflow - -The ``buffer`` corresponds to ``insert`` API in LookupBuffer, and the ``drop_select`` corresponds to ``drop_select`` API in LookupBuffer. - - -Third-party contributions -------------------------- - -Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and vLLM team will actively review and merge new PRs for third-party connectors). - -We recommend three ways of implementations: - -* **Fully-customized connector**: Implement your own ``Connector``, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. -* **Database-like connector**: Implement your own ``LookupBuffer`` and support the ``insert`` and ``drop_select`` APIs just like SQL. -* **Distributed P2P connector**: Implement your own ``Pipe`` and support the ``send_tensor`` and ``recv_tensor`` APIs, just like `torch.distributed`. diff --git a/docs/source/usage/engine_args.rst b/docs/source/usage/engine_args.md similarity index 76% rename from docs/source/usage/engine_args.rst rename to docs/source/usage/engine_args.md index e7ce8cdcabe88..cd3c6a430b7fa 100644 --- a/docs/source/usage/engine_args.rst +++ b/docs/source/usage/engine_args.md @@ -1,23 +1,25 @@ -.. _engine_args: +(engine-args)= -Engine Arguments -================ +# Engine Arguments Below, you can find an explanation of every engine argument for vLLM: +```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils :func: _engine_args_parser :prog: vllm serve :nodefaultconst: +``` -Async Engine Arguments ----------------------- +## Async Engine Arguments Below are the additional arguments related to the asynchronous engine: +```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils :func: _async_engine_args_parser :prog: vllm serve - :nodefaultconst: \ No newline at end of file + :nodefaultconst: +``` diff --git a/docs/source/usage/env_vars.md b/docs/source/usage/env_vars.md new file mode 100644 index 0000000000000..f9b08077a03b4 --- /dev/null +++ b/docs/source/usage/env_vars.md @@ -0,0 +1,15 @@ +# Environment Variables + +vLLM uses the following environment variables to configure the system: + +```{warning} +Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. 
If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. + +All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). +``` + +```{literalinclude} ../../../vllm/envs.py +:end-before: end-env-vars-definition +:language: python +:start-after: begin-env-vars-definition +``` diff --git a/docs/source/usage/env_vars.rst b/docs/source/usage/env_vars.rst deleted file mode 100644 index ff2259c0da3f1..0000000000000 --- a/docs/source/usage/env_vars.rst +++ /dev/null @@ -1,14 +0,0 @@ -Environment Variables -======================== - -vLLM uses the following environment variables to configure the system: - -.. warning:: - Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work. - - All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service as ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix `_. - -.. literalinclude:: ../../../vllm/envs.py - :language: python - :start-after: begin-env-vars-definition - :end-before: end-env-vars-definition diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.md similarity index 61% rename from docs/source/usage/faq.rst rename to docs/source/usage/faq.md index d88da32092924..fde2954f10c59 100644 --- a/docs/source/usage/faq.rst +++ b/docs/source/usage/faq.md @@ -1,34 +1,33 @@ -.. _faq: +(faq)= -Frequently Asked Questions -=========================== +# Frequently Asked Questions - Q: How can I serve multiple models on a single port using the OpenAI API? +> Q: How can I serve multiple models on a single port using the OpenAI API? A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly. ----------------------------------------- +______________________________________________________________________ - Q: Which model to use for offline inference embedding? +> Q: Which model to use for offline inference embedding? -A: You can try `e5-mistral-7b-instruct `__ and `BAAI/bge-base-en-v1.5 `__; -more are listed :ref:`here `. +A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5); +more are listed [here](#supported-models). 
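+
+As a rough sketch of offline embedding inference (the model name comes from the answer above, and the exact attributes on the returned outputs may vary between vLLM versions):
+
+```python
+from vllm import LLM
+
+# Run the model in embedding (pooling) mode.
+llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
+
+outputs = llm.encode(["Hello, my name is"])
+for output in outputs:
+    print(output.outputs)  # pooled embedding for this prompt
+```
+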
-By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B `__, -`Mistral-7B-Instruct-v0.3 `__ into embedding models, +By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), +[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, but they are expected be inferior to models that are specifically trained on embedding tasks. ----------------------------------------- +______________________________________________________________________ - Q: Can the output of a prompt vary across runs in vLLM? +> Q: Can the output of a prompt vary across runs in vLLM? A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to -numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, -see the `Numerical Accuracy section `_. +numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, +see the [Numerical Accuracy section](https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations). In vLLM, the same requests might be batched differently due to factors such as other concurrent requests, -changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, -can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in +changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, +can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. **Mitigation Strategies** diff --git a/docs/source/usage/lora.md b/docs/source/usage/lora.md new file mode 100644 index 0000000000000..cf06916d70f44 --- /dev/null +++ b/docs/source/usage/lora.md @@ -0,0 +1,214 @@ +(lora-adapter)= + +# LoRA Adapters + +This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model. + +LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`. + +Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save +them locally with + +```python +from huggingface_hub import snapshot_download + +sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") +``` + +Then we instantiate the base model and pass in the `enable_lora=True` flag: + +```python +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) +``` + +We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter +of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and +the third parameter is the path to the LoRA adapter. 
+ +```python +sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] +) + +prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", +] + +outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) +) +``` + +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. + +## Serving LoRA Adapters + +LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use +`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kickoff the server: + +```bash +vllm serve meta-llama/Llama-2-7b-hf \ + --enable-lora \ + --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ +``` + +```{note} +The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. +``` + +The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, +etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along +with its base model: + +```bash +curl localhost:8000/v1/models | jq . +{ + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + ... + }, + { + "id": "sql-lora", + "object": "model", + ... + } + ] +} +``` + +Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be +processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other +LoRA adapter requests if they were provided and `max_loras` is set high enough). + +The following is an example request + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' | jq +``` + +## Dynamically serving LoRA Adapters + +In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading +LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility +to change models on-the-fly is needed. + +Note: Enabling this feature in production environments is risky as user may participate model adapter management. + +To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. + +```bash +export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True +``` + +Loading a LoRA Adapter: + +To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary +details of the adapter to be loaded. 
The request payload should include the name and path to the LoRA adapter. + +Example request to load a LoRA adapter: + +```bash +curl -X POST http://localhost:8000/v1/load_lora_adapter \ +-H "Content-Type: application/json" \ +-d '{ + "lora_name": "sql_adapter", + "lora_path": "/path/to/sql-lora-adapter" +}' +``` + +Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter +cannot be found or loaded, an appropriate error message will be returned. + +Unloading a LoRA Adapter: + +To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint +with the name or ID of the adapter to be unloaded. + +Example request to unload a LoRA adapter: + +```bash +curl -X POST http://localhost:8000/v1/unload_lora_adapter \ +-H "Content-Type: application/json" \ +-d '{ + "lora_name": "sql_adapter" +}' +``` + +## New format for `--lora-modules` + +In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: + +```bash +--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ +``` + +This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`. +Now, you can specify a base_model_name alongside the name and path using JSON format. For example: + +```bash +--lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' +``` + +To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case. + +## Lora model lineage in model card + +The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this: + +- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. +- The `root` field points to the artifact location of the lora adapter. + +```bash +$ curl http://localhost:8000/v1/models + +{ + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", + "parent": null, + "permission": [ + { + ..... + } + ] + }, + { + "id": "sql-lora", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", + "parent": meta-llama/Llama-2-7b-hf, + "permission": [ + { + .... + } + ] + } + ] +} +``` diff --git a/docs/source/usage/lora.rst b/docs/source/usage/lora.rst deleted file mode 100644 index c2c6fa2aebfaf..0000000000000 --- a/docs/source/usage/lora.rst +++ /dev/null @@ -1,225 +0,0 @@ -.. _lora: - -LoRA Adapters -============= - -This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. - -LoRA adapters can be used with any vLLM model that implements :class:`~vllm.model_executor.models.interfaces.SupportsLoRA`. - -Adapters can be efficiently served on a per request basis with minimal overhead. 
First we download the adapter(s) and save -them locally with - -.. code-block:: python - - from huggingface_hub import snapshot_download - - sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - - -Then we instantiate the base model and pass in the ``enable_lora=True`` flag: - -.. code-block:: python - - from vllm import LLM, SamplingParams - from vllm.lora.request import LoRARequest - - llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) - - -We can now submit the prompts and call ``llm.generate`` with the ``lora_request`` parameter. The first parameter -of ``LoRARequest`` is a human identifiable name, the second parameter is a globally unique ID for the adapter and -the third parameter is the path to the LoRA adapter. - -.. code-block:: python - - sampling_params = SamplingParams( - temperature=0, - max_tokens=256, - stop=["[/assistant]"] - ) - - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - ] - - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) - ) - - -Check out `examples/multilora_inference.py `_ -for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. - -Serving LoRA Adapters ---------------------- -LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use -``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kickoff the server: - -.. code-block:: bash - - vllm serve meta-llama/Llama-2-7b-hf \ - --enable-lora \ - --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ - -.. note:: - The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. - -The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``, -etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along -with its base model: - -.. code-block:: bash - - curl localhost:8000/v1/models | jq . - { - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - ... - }, - { - "id": "sql-lora", - "object": "model", - ... - } - ] - } - -Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be -processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other -LoRA adapter requests if they were provided and ``max_loras`` is set high enough). - -The following is an example request - -.. 
code-block:: bash - - curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "sql-lora", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' | jq - - -Dynamically serving LoRA Adapters ---------------------------------- - -In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading -LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility -to change models on-the-fly is needed. - -Note: Enabling this feature in production environments is risky as user may participate model adapter management. - -To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` -is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. - -.. code-block:: bash - - export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True - - -Loading a LoRA Adapter: - -To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary -details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter. - -Example request to load a LoRA adapter: - -.. code-block:: bash - - curl -X POST http://localhost:8000/v1/load_lora_adapter \ - -H "Content-Type: application/json" \ - -d '{ - "lora_name": "sql_adapter", - "lora_path": "/path/to/sql-lora-adapter" - }' - -Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter -cannot be found or loaded, an appropriate error message will be returned. - -Unloading a LoRA Adapter: - -To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint -with the name or ID of the adapter to be unloaded. - -Example request to unload a LoRA adapter: - -.. code-block:: bash - - curl -X POST http://localhost:8000/v1/unload_lora_adapter \ - -H "Content-Type: application/json" \ - -d '{ - "lora_name": "sql_adapter" - }' - - -New format for `--lora-modules` -------------------------------- - -In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: - -.. code-block:: bash - - --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ - -This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`. -Now, you can specify a base_model_name alongside the name and path using JSON format. For example: - -.. code-block:: bash - - --lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' - -To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case. - - -Lora model lineage in model card --------------------------------- - -The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this: - -- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. 
-- The `root` field points to the artifact location of the lora adapter. - -.. code-block:: bash - - $ curl http://localhost:8000/v1/models - - { - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", - "parent": null, - "permission": [ - { - ..... - } - ] - }, - { - "id": "sql-lora", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", - "parent": meta-llama/Llama-2-7b-hf, - "permission": [ - { - .... - } - ] - } - ] - } diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md new file mode 100644 index 0000000000000..82a3f3b8909a1 --- /dev/null +++ b/docs/source/usage/multimodal_inputs.md @@ -0,0 +1,486 @@ +(multimodal-inputs)= + +# Multimodal Inputs + +This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. + +```{note} +We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, +and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. +``` + +## Offline Inference + +To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: + +- `prompt`: The prompt should follow the format that is documented on HuggingFace. +- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`. + +### Image + +You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples: + +```python +llm = LLM(model="llava-hf/llava-1.5-7b-hf") + +# Refer to the HuggingFace repo for the correct format to use +prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + +# Load the image using PIL.Image +image = PIL.Image.open(...) + +# Single prompt inference +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image}, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +# Batch inference +image_1 = PIL.Image.open(...) +image_2 = PIL.Image.open(...) +outputs = llm.generate( + [ + { + "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_1}, + }, + { + "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_2}, + } + ] +) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +Full example: + +To substitute multiple images inside the same text prompt, you can pass in a list of images instead: + +```python +llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + limit_mm_per_prompt={"image": 2}, # The maximum number to accept +) + +# Refer to the HuggingFace repo for the correct format to use +prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + +# Load the images using PIL.Image +image1 = PIL.Image.open(...) +image2 = PIL.Image.open(...) 
+ +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +Full example: + +Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: + +```python +# Specify the maximum number of frames per video to be 4. This can be changed. +llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + +# Create the request payload. +video_frames = ... # load your video making sure it only has the number of frames specified earlier. +message = { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + ], +} +for i in range(len(video_frames)): + base64_image = encode_image(video_frames[i]) # base64 encoding. + new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + message["content"].append(new_image) + +# Perform inference and log output. +outputs = llm.chat([message]) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +### Video + +You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary +instead of using multi-image input. + +Full example: + +### Audio + +You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary. + +Full example: + +### Embedding + +To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, +pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. + +```python +# Inference with image embeddings as input +llm = LLM(model="llava-hf/llava-1.5-7b-hf") + +# Refer to the HuggingFace repo for the correct format to use +prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + +# Embeddings for single image +# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) +image_embeds = torch.load(...) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: + +```python +# Construct the prompt based on your model +prompt = ... + +# Embeddings for multiple images +# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) +image_embeds = torch.load(...) + +# Qwen2-VL +llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) +mm_data = { + "image": { + "image_embeds": image_embeds, + # image_grid_thw is needed to calculate positional encoding. + "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), + } +} + +# MiniCPM-V +llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) +mm_data = { + "image": { + "image_embeds": image_embeds, + # image_size_list is needed to calculate details of the sliced image. 
+ "image_size_list": [image.size for image in images], # list of image sizes + } +} + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +## Online Inference + +Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). + +```{important} +A chat template is **required** to use Chat Completions API. + +Although most models come with a chat template, for others you have to define one yourself. +The chat template can be inferred based on the documentation on the model's HuggingFace repo. +For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: +``` + +### Image + +Image input is supported according to [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). +Here is a simple example using Phi-3.5-Vision. + +First, launch the OpenAI-compatible server: + +```bash +vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 +``` + +Then, you can use the OpenAI client as follows: + +```python +from openai import OpenAI + +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +# Single-image input inference +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. + {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], +) +print("Chat completion output:", chat_response.choices[0].message.content) + +# Multi-image input inference +image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" +image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + +chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, + ], + }], +) +print("Chat completion output:", chat_response.choices[0].message.content) +``` + +Full example: + +```{tip} +Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, +and pass the file path as `url` in the API request. +``` + +```{tip} +There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. +In fact, you can place image placeholders in the middle of the text by interleaving text and image content. +``` + +````{note} +By default, the timeout for fetching images through HTTP URL is `5` seconds. 
+You can override this by setting the environment variable: + +```console +$ export VLLM_IMAGE_FETCH_TIMEOUT= +``` +```` + +### Video + +Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. + +You can use [these tests](gh-file:entrypoints/openai/test_video.py) as reference. + +````{note} +By default, the timeout for fetching videos through HTTP URL url is `30` seconds. +You can override this by setting the environment variable: + +```console +$ export VLLM_VIDEO_FETCH_TIMEOUT= +``` +```` + +### Audio + +Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). +Here is a simple example using Ultravox-v0.3. + +First, launch the OpenAI-compatible server: + +```bash +vllm serve fixie-ai/ultravox-v0_3 +``` + +Then, you can use the OpenAI client as follows: + +```python +import base64 +import requests +from openai import OpenAI +from vllm.assets.audio import AudioAsset + +def encode_base64_content_from_url(content_url: str) -> str: + """Encode a content retrieved from a remote url to base64 format.""" + + with requests.get(content_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +# Any format supported by librosa is supported +audio_url = AudioAsset("winning_call").url +audio_base64 = encode_base64_content_from_url(audio_url) + +chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" + }, + { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, +) + +result = chat_completion_from_base64.choices[0].message.content +print("Chat completion output from input audio:", result) +``` + +Alternatively, you can pass {code}`audio_url`, which is the audio counterpart of {code}`image_url` for image input: + +```python +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, +) + +result = chat_completion_from_url.choices[0].message.content +print("Chat completion output from audio url:", result) +``` + +Full example: + +````{note} +By default, the timeout for fetching audios through HTTP URL is `10` seconds. +You can override this by setting the environment variable: + +```console +$ export VLLM_AUDIO_FETCH_TIMEOUT= +``` +```` + +### Embedding + +vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings), +where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models. + +```{tip} +The schema of `messages` is exactly the same as in Chat Completions API. +You can refer to the above tutorials for more details on how to pass each type of multi-modal data. +``` + +Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. +Refer to the examples below for illustration. + +Here is an end-to-end example using VLM2Vec. 
To serve the model: + +```bash +vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja +``` + +```{important} +Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` +to run this model in embedding mode instead of text generation mode. + +The custom chat template is completely different from the original one for this model, +and can be found here: +``` + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: + +```python +import requests + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, +) +response.raise_for_status() +response_json = response.json() +print("Embedding output:", response_json["data"][0]["embedding"]) +``` + +Below is another example, this time using the `MrLight/dse-qwen2-2b-mrl-v1` model. + +```bash +vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja +``` + +```{important} +Like with VLM2Vec, we have to explicitly pass `--task embed`. + +Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled +by a custom chat template: +``` + +```{important} +Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code +example below for details. +``` + +Full example: diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst deleted file mode 100644 index 680382e457cc5..0000000000000 --- a/docs/source/usage/multimodal_inputs.rst +++ /dev/null @@ -1,492 +0,0 @@ -.. _multimodal_inputs: - -Multimodal Inputs -================= - -This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models ` in vLLM. - -.. note:: - We are actively iterating on multi-modal support. See `this RFC `_ for upcoming changes, - and `open an issue on GitHub `_ if you have any feedback or feature requests. - -Offline Inference ------------------ - -To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`: - -* ``prompt``: The prompt should follow the format that is documented on HuggingFace. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. - -Image -^^^^^ - -You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples: - -.. code-block:: python - - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - # Refer to the HuggingFace repo for the correct format to use - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - # Load the image using PIL.Image - image = PIL.Image.open(...) 
- - # Single prompt inference - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Batch inference - image_1 = PIL.Image.open(...) - image_2 = PIL.Image.open(...) - outputs = llm.generate( - [ - { - "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_1}, - }, - { - "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_2}, - } - ] - ) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language.py `_. - -To substitute multiple images inside the same text prompt, you can pass in a list of images instead: - -.. code-block:: python - - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, # Required to load Phi-3.5-vision - max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - limit_mm_per_prompt={"image": 2}, # The maximum number to accept - ) - - # Refer to the HuggingFace repo for the correct format to use - prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" - - # Load the images using PIL.Image - image1 = PIL.Image.open(...) - image2 = PIL.Image.open(...) - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. - -Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos: - -.. code-block:: python - - # Specify the maximum number of frames per video to be 4. This can be changed. - llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) - - # Create the request payload. - video_frames = ... # load your video making sure it only has the number of frames specified earlier. - message = { - "role": "user", - "content": [ - {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, - ], - } - for i in range(len(video_frames)): - base64_image = encode_image(video_frames[i]) # base64 encoding. - new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} - message["content"].append(new_image) - - # Perform inference and log output. - outputs = llm.chat([message]) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -Video -^^^^^ - -You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary -instead of using multi-image input. - -Please refer to `examples/offline_inference_vision_language.py `_ for more details. - -Audio -^^^^^ - -You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary. - -Please refer to `examples/offline_inference_audio_language.py `_ for more details. - -Embedding -^^^^^^^^^ - -To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, -pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. - -.. 
code-block:: python - - # Inference with image embeddings as input - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - # Refer to the HuggingFace repo for the correct format to use - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - # Embeddings for single image - # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) - image_embeds = torch.load(...) - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image_embeds}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: - -.. code-block:: python - - # Construct the prompt based on your model - prompt = ... - - # Embeddings for multiple images - # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) - image_embeds = torch.load(...) - - # Qwen2-VL - llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) - mm_data = { - "image": { - "image_embeds": image_embeds, - # image_grid_thw is needed to calculate positional encoding. - "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), - } - } - - # MiniCPM-V - llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) - mm_data = { - "image": { - "image_embeds": image_embeds, - # image_size_list is needed to calculate details of the sliced image. - "image_size_list": [image.size for image in images], # list of image sizes - } - } - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": mm_data, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -Online Inference ----------------- - -Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API `_. - -.. important:: - A chat template is **required** to use Chat Completions API. - - Although most models come with a chat template, for others you have to define one yourself. - The chat template can be inferred based on the documentation on the model's HuggingFace repo. - For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `__. - -Image -^^^^^ - -Image input is supported according to `OpenAI Vision API `_. -Here is a simple example using Phi-3.5-Vision. - -First, launch the OpenAI-compatible server: - -.. code-block:: bash - - vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 - -Then, you can use the OpenAI client as follows: - -.. code-block:: python - - from openai import OpenAI - - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - # Single-image input inference - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. 
- {"type": "text", "text": "What’s in this image?"}, - {"type": "image_url", "image_url": {"url": image_url}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - - # Multi-image input inference - image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" - image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - {"type": "image_url", "image_url": {"url": image_url_duck}}, - {"type": "image_url", "image_url": {"url": image_url_lion}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - -A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. - -.. tip:: - Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine, - and pass the file path as ``url`` in the API request. - -.. tip:: - There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. - In fact, you can place image placeholders in the middle of the text by interleaving text and image content. - -.. note:: - - By default, the timeout for fetching images through HTTP URL is ``5`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_IMAGE_FETCH_TIMEOUT= - -Video -^^^^^ - -Instead of :code:`image_url`, you can pass a video file via :code:`video_url`. - -You can use `these tests `_ as reference. - -.. note:: - - By default, the timeout for fetching videos through HTTP URL url is ``30`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_VIDEO_FETCH_TIMEOUT= - -Audio -^^^^^ - -Audio input is supported according to `OpenAI Audio API `_. -Here is a simple example using Ultravox-v0.3. - -First, launch the OpenAI-compatible server: - -.. code-block:: bash - - vllm serve fixie-ai/ultravox-v0_3 - -Then, you can use the OpenAI client as follows: - -.. code-block:: python - - import base64 - import requests - from openai import OpenAI - from vllm.assets.audio import AudioAsset - - def encode_base64_content_from_url(content_url: str) -> str: - """Encode a content retrieved from a remote url to base64 format.""" - - with requests.get(content_url) as response: - response.raise_for_status() - result = base64.b64encode(response.content).decode('utf-8') - - return result - - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - # Any format supported by librosa is supported - audio_url = AudioAsset("winning_call").url - audio_base64 = encode_base64_content_from_url(audio_url) - - chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" 
- }, - { - "type": "input_audio", - "input_audio": { - "data": audio_base64, - "format": "wav" - }, - }, - ], - }], - model=model, - max_completion_tokens=64, - ) - - result = chat_completion_from_base64.choices[0].message.content - print("Chat completion output from input audio:", result) - -Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input: - -.. code-block:: python - - chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url - }, - }, - ], - }], - model=model, - max_completion_tokens=64, - ) - - result = chat_completion_from_url.choices[0].message.content - print("Chat completion output from audio url:", result) - -A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. - -.. note:: - - By default, the timeout for fetching audios through HTTP URL is ``10`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_AUDIO_FETCH_TIMEOUT= - -Embedding -^^^^^^^^^ - -vLLM's Embeddings API is a superset of OpenAI's `Embeddings API `_, -where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. - -.. tip:: - The schema of ``messages`` is exactly the same as in Chat Completions API. - You can refer to the above tutorials for more details on how to pass each type of multi-modal data. - -Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. -Refer to the examples below for illustration. - -Here is an end-to-end example using VLM2Vec. To serve the model: - -.. code-block:: bash - - vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ - --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja - -.. important:: - - Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embed`` - to run this model in embedding mode instead of text generation mode. - - The custom chat template is completely different from the original one for this model, - and can be found `here `__. - -Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: - -.. code-block:: python - - import requests - - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - }], - "encoding_format": "float", - }, - ) - response.raise_for_status() - response_json = response.json() - print("Embedding output:", response_json["data"][0]["embedding"]) - -Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model. - -.. code-block:: bash - - vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ - --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja - -.. important:: - - Like with VLM2Vec, we have to explicitly pass ``--task embed``. 
- - Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled - by `this custom chat template `__. - -.. important:: - - Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code - example below for details. - -A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/docs/source/usage/performance.rst b/docs/source/usage/performance.md similarity index 54% rename from docs/source/usage/performance.rst rename to docs/source/usage/performance.md index 23b5ab79a7378..f028e28627a9f 100644 --- a/docs/source/usage/performance.rst +++ b/docs/source/usage/performance.md @@ -1,16 +1,15 @@ -.. _performance: +(performance)= -Performance and Tuning -====================== +# Performance and Tuning + +## Preemption -Preemption ----------- Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: ``` -WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 +WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 ``` While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency. @@ -22,44 +21,44 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. -.. _chunked-prefill: +(chunked-prefill)= -Chunked Prefill ---------------- -vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. +## Chunked Prefill -You can enable the feature by specifying ``--enable-chunked-prefill`` in the command line or setting ``enable_chunked_prefill=True`` in the LLM constructor. +vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. -.. code-block:: python +You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor. - llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) - # Set max_num_batched_tokens to tune performance. - # NOTE: 512 is the default max_num_batched_tokens for chunked prefill. 
- # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +```python +llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) +# Set max_num_batched_tokens to tune performance. +# NOTE: 512 is the default max_num_batched_tokens for chunked prefill. +# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +``` By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter token latency) and inefficient GPU utilization. Once chunked prefill is enabled, the policy is changed to prioritize decode requests. It batches all pending decode requests to the batch before scheduling any prefill. -When there are available token_budget (``max_num_batched_tokens``), it schedules pending prefills. -If a last pending prefill request cannot fit into ``max_num_batched_tokens``, it chunks it. +When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. +If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it. This policy has two benefits: - It improves ITL and generation decode because decode requests are prioritized. - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. -You can tune the performance by changing ``max_num_batched_tokens``. +You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B). -Smaller ``max_num_batched_tokens`` achieves better ITL because there are fewer prefills interrupting decodes. -Higher ``max_num_batched_tokens`` achieves better TTFT as you can put more prefill to the batch. +Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes. +Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch. -- If ``max_num_batched_tokens`` is the same as ``max_model_len``, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). -- Note that the default value (512) of ``max_num_batched_tokens`` is optimized for ITL, and it may have lower throughput than the default scheduler. +- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). +- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. -We recommend you set ``max_num_batched_tokens > 2048`` for throughput. +We recommend you set `max_num_batched_tokens > 2048` for throughput. -See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369). +See related papers for more details ( or ). -Please try out this feature and let us know your feedback via GitHub issues! \ No newline at end of file +Please try out this feature and let us know your feedback via GitHub issues! 
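To make the chunked prefill tuning advice above concrete, here is a minimal sketch (editorial illustration, not part of the diff) of a throughput-oriented configuration. The token budget of 2048 is only an illustrative starting point and should be benchmarked on your own model and hardware.

```python
from vllm import LLM

# A throughput-oriented sketch: enable chunked prefill and raise the token
# budget above the 512 default, per the recommendation above.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_chunked_prefill=True,
    max_num_batched_tokens=2048,  # illustrative value; tune per workload
)

# The equivalent server-side invocation uses the corresponding CLI flags:
#   vllm serve meta-llama/Llama-2-7b-hf \
#       --enable-chunked-prefill --max-num-batched-tokens 2048
```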
diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md new file mode 100644 index 0000000000000..8302da81b6173 --- /dev/null +++ b/docs/source/usage/spec_decode.md @@ -0,0 +1,205 @@ +(spec-decode)= + +# Speculative decoding + +```{warning} +Please note that speculative decoding in vLLM is not yet optimized and does +not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. +The work to optimize it is ongoing and can be followed here: +``` + +```{warning} +Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. +``` + +This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. +Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. + +## Speculating with a draft model + +The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="facebook/opt-125m", + num_speculative_tokens=5, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +To perform the same with an online mode, launch the server: + +```bash +python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ + --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ + --num_speculative_tokens 5 --gpu_memory_utilization 0.8 +``` + +Then use a client: + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Completion API +stream = False +completion = client.completions.create( + model=model, + prompt="The future of AI is", + echo=False, + n=1, + stream=stream, +) + +print("Completion results:") +if stream: + for c in completion: + print(c) +else: + print(completion) +``` + +## Speculating by matching n-grams in the prompt + +The following code configures vLLM to use speculative decoding where proposals are generated by +matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259) + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="[ngram]", + num_speculative_tokens=5, + ngram_prompt_lookup_max=4, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +## Speculating using MLP speculators + +The following code configures vLLM to use speculative decoding where proposals are generated by +draft models that condition draft predictions on both context vectors and sampled tokens.
+For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or +[this technical report](https://arxiv.org/abs/2404.19124). + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="meta-llama/Meta-Llama-3.1-70B-Instruct", + tensor_parallel_size=4, + speculative_model="ibm-fms/llama3-70b-accelerator", + speculative_draft_tensor_parallel_size=1, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +Note that these speculative models currently need to be run without tensor parallelism, although +it is possible to run the main model using tensor parallelism (see example above). Since the +speculative models are relatively small, we still see significant speedups. However, this +limitation will be fixed in a future release. + +A variety of speculative models of this type are available on HF hub: + +- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator) +- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator) +- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator) +- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator) +- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator) +- [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator) +- [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator) +- [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) +- [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator) + +## Lossless guarantees of Speculative Decoding + +In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of +speculative decoding, breaking down the guarantees into three key areas: + +1. **Theoretical Losslessness** + \- Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might + cause slight variations in output distributions, as discussed + in [Accelerating Large Language Model Decoding with Speculative Sampling](https://arxiv.org/pdf/2302.01318) + +2. **Algorithmic Losslessness** + \- vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include: + + > - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target + > distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252) + > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling + > without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, + > provides a lossless guarantee. Almost all of the tests in . 
+ > verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291) + +3. **vLLM Logprob Stability** + \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the + same request across runs. For more details, see the FAQ section + titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. + +**Conclusion** + +While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding +can occur due to following factors: + +- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution. +- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially + due to non-deterministic behavior in batched operations or numerical instability. + +**Mitigation Strategies** + +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. + +## Resources for vLLM contributors + +- [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4) +- [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a) +- [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8) +- [Dynamic speculative decoding](gh-issue:4565) diff --git a/docs/source/usage/spec_decode.rst b/docs/source/usage/spec_decode.rst deleted file mode 100644 index f1f1917f974bb..0000000000000 --- a/docs/source/usage/spec_decode.rst +++ /dev/null @@ -1,210 +0,0 @@ -.. _spec_decode: - -Speculative decoding -==================== - -.. warning:: - Please note that speculative decoding in vLLM is not yet optimized and does - not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work - to optimize it is ongoing and can be followed in `this issue. `_ - -.. warning:: - Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. - -This document shows how to use `Speculative Decoding `_ with vLLM. -Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. - -Speculating with a draft model ------------------------------- - -The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. - -.. code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="facebook/opt-6.7b", - tensor_parallel_size=1, - speculative_model="facebook/opt-125m", - num_speculative_tokens=5, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -To perform the same with an online mode launch the server: - -.. 
code-block:: bash - - python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ - --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ - --num_speculative_tokens 5 --gpu_memory_utilization 0.8 - -Then use a client: - -.. code-block:: python - - from openai import OpenAI - - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, - ) - - models = client.models.list() - model = models.data[0].id - - # Completion API - stream = False - completion = client.completions.create( - model=model, - prompt="The future of AI is", - echo=False, - n=1, - stream=stream, - ) - - print("Completion results:") - if stream: - for c in completion: - print(c) - else: - print(completion) - -Speculating by matching n-grams in the prompt ---------------------------------------------- - -The following code configures vLLM to use speculative decoding where proposals are generated by -matching n-grams in the prompt. For more information read `this thread. `_ - -.. code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="facebook/opt-6.7b", - tensor_parallel_size=1, - speculative_model="[ngram]", - num_speculative_tokens=5, - ngram_prompt_lookup_max=4, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -Speculating using MLP speculators ---------------------------------- - -The following code configures vLLM to use speculative decoding where proposals are generated by -draft models that conditioning draft predictions on both context vectors and sampled tokens. -For more information see `this blog `_ or -`this technical report `_. - -.. code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="meta-llama/Meta-Llama-3.1-70B-Instruct", - tensor_parallel_size=4, - speculative_model="ibm-fms/llama3-70b-accelerator", - speculative_draft_tensor_parallel_size=1, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -Note that these speculative models currently need to be run without tensor parallelism, although -it is possible to run the main model using tensor parallelism (see example above). Since the -speculative models are relatively small, we still see significant speedups. However, this -limitation will be fixed in a future release. 
- -A variety of speculative models of this type are available on HF hub: - -* `llama-13b-accelerator `_ -* `llama3-8b-accelerator `_ -* `codellama-34b-accelerator `_ -* `llama2-70b-accelerator `_ -* `llama3-70b-accelerator `_ -* `granite-3b-code-instruct-accelerator `_ -* `granite-8b-code-instruct-accelerator `_ -* `granite-7b-instruct-accelerator `_ -* `granite-20b-code-instruct-accelerator `_ - -Lossless guarantees of Speculative Decoding -------------------------------------------- -In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of -speculative decoding, breaking down the guarantees into three key areas: - -1. **Theoretical Losslessness** - - Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might - cause slight variations in output distributions, as discussed - in `Accelerating Large Language Model Decoding with Speculative Sampling `_ - -2. **Algorithmic Losslessness** - - vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include: - - - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target - distribution. `View Test Code `_ - - - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling - without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, - provides a lossless guarantee. Almost all of the tests in `this directory `_ - verify this property using `this assertion implementation `_ - -3. **vLLM Logprob Stability** - - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the - same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. - - -**Conclusion** - -While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding -can occur due to following factors: - -- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution. - -- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially - due to non-deterministic behavior in batched operations or numerical instability. - -**Mitigation Strategies** - -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. - -Resources for vLLM contributors -------------------------------- -* `A Hacker's Guide to Speculative Decoding in vLLM `_ -* `What is Lookahead Scheduling in vLLM? `_ -* `Information on batch expansion `_ -* `Dynamic speculative decoding `_ diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md new file mode 100644 index 0000000000000..3f5d9ffc26278 --- /dev/null +++ b/docs/source/usage/structured_outputs.md @@ -0,0 +1,260 @@ +(structured-outputs)= + +# Structured Outputs + +vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding. 
+This document shows you some examples of the different options that are available to generate structured outputs. + +## Online Inference (OpenAI API) + +You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. + +The following parameters are supported, which must be added as extra parameters: + +- `guided_choice`: the output will be exactly one of the choices. +- `guided_regex`: the output will follow the regex pattern. +- `guided_json`: the output will follow the JSON schema. +- `guided_grammar`: the output will follow the context free grammar. +- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. +- `guided_decoding_backend`: used to select the guided decoding backend to use. + +You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. + +Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: + +```python +from openai import OpenAI +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", +) + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={"guided_choice": ["positive", "negative"]}, +) +print(completion.choices[0].message.content) +``` + +The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: + +```python +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", + } + ], + extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, +) +print(completion.choices[0].message.content) +``` + +One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. +For this we can use the `guided_json` parameter in two different ways: + +- Using directly a [JSON Schema](https://json-schema.org/) +- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option). + +The next example shows how to use the `guided_json` parameter with a Pydantic model: + +```python +from pydantic import BaseModel +from enum import Enum + +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + +json_schema = CarDescription.model_json_schema() + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", + } + ], + extra_body={"guided_json": json_schema}, +) +print(completion.choices[0].message.content) +``` + +```{tip} +While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. +This can improve the results notably in most cases. 
+``` + +Finally we have the `guided_grammar`, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. +It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: + +```python +simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +""" + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", + } + ], + extra_body={"guided_grammar": simplified_sql_grammar}, +) +print(completion.choices[0].message.content) +``` + +Full example: + +## Experimental Automatic Parsing (OpenAI API) + +This section covers the OpenAI beta wrapper over the `client.chat.completions.create()` method that provides richer integrations with Python specific types. + +At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104). + +For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.1-8B-Instruct` + +Here is a simple example demonstrating how to get structured output using Pydantic models: + +```python +from pydantic import BaseModel +from openai import OpenAI + + +class Info(BaseModel): + name: str + age: int + + +client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") +completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "My name is Cameron, I'm 28. 
What's my name and age?"}, + ], + response_format=Info, + extra_body=dict(guided_decoding_backend="outlines"), +) + +message = completion.choices[0].message +print(message) +assert message.parsed +print("Name:", message.parsed.name) +print("Age:", message.parsed.age) +``` + +Output: + +```console +ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) +Name: Cameron +Age: 28 +``` + +Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: + +```python +from typing import List +from pydantic import BaseModel +from openai import OpenAI + + +class Step(BaseModel): + explanation: str + output: str + + +class MathResponse(BaseModel): + steps: List[Step] + final_answer: str + + +client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") +completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful expert math tutor."}, + {"role": "user", "content": "Solve 8x + 31 = 2."}, + ], + response_format=MathResponse, + extra_body=dict(guided_decoding_backend="outlines"), +) + +message = completion.choices[0].message +print(message) +assert message.parsed +for i, step in enumerate(message.parsed.steps): + print(f"Step #{i}:", step) +print("Answer:", message.parsed.final_answer) +``` + +Output: + +```console +ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) +Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31' +Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29' +Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8' +Answer: x = -29/8 +``` + +## Offline Inference + +Offline inference allows for the same types of guided decoding. +To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`. +The main available options inside `GuidedDecodingParams` are: + +- `json` +- `regex` +- `choice` +- `grammar` +- `backend` +- `whitespace_pattern` + +These parameters can be used in the same way as the parameters from the Online Inference examples above. 
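In addition to the `choice` example that follows, the other `GuidedDecodingParams` fields work the same way offline. As a minimal sketch (an editorial illustration mirroring the online `guided_regex` example; the prompt and pattern are assumptions, not taken from the diff), regex-constrained offline generation might look like:

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Constrain the output to an email-like string, analogous to the online
# guided_regex example earlier in this document.
guided_decoding_params = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
sampling_params = SamplingParams(
    guided_decoding=guided_decoding_params,
    stop=["\n"],
)
outputs = llm.generate(
    prompts="Generate an example email address for Alan Turing, who works at Enigma: ",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```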
+One example for the usage of the `choices` parameter is shown below: + +```python +from vllm import LLM, SamplingParams +from vllm.sampling_params import GuidedDecodingParams + +llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") + +guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) +``` + +Full example: diff --git a/docs/source/usage/structured_outputs.rst b/docs/source/usage/structured_outputs.rst deleted file mode 100644 index 484e1f17d191e..0000000000000 --- a/docs/source/usage/structured_outputs.rst +++ /dev/null @@ -1,267 +0,0 @@ -.. _structured_outputs: - -Structured Outputs -================== - -vLLM supports the generation of structured outputs using `outlines `_ or `lm-format-enforcer `_ as backends for the guided decoding. -This document shows you some examples of the different options that are available to generate structured outputs. - - -Online Inference (OpenAI API) ------------------------------ - -You can generate structured outputs using the OpenAI's `Completions `_ and `Chat `_ API. - -The following parameters are supported, which must be added as extra parameters: - -- ``guided_choice``: the output will be exactly one of the choices. -- ``guided_regex``: the output will follow the regex pattern. -- ``guided_json``: the output will follow the JSON schema. -- ``guided_grammar``: the output will follow the context free grammar. -- ``guided_whitespace_pattern``: used to override the default whitespace pattern for guided json decoding. -- ``guided_decoding_backend``: used to select the guided decoding backend to use. - -You can see the complete list of supported parameters on the `OpenAI Compatible Server `_ page. - -Now let´s see an example for each of the cases, starting with the ``guided_choice``, as it´s the easiest one: - -.. code-block:: python - - from openai import OpenAI - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", - ) - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_body={"guided_choice": ["positive", "negative"]}, - ) - print(completion.choices[0].message.content) - - -The next example shows how to use the ``guided_regex``. The idea is to generate an email address, given a simple regex template: - -.. code-block:: python - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", - } - ], - extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, - ) - print(completion.choices[0].message.content) - -One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. -For this we can use the ``guided_json`` parameter in two different ways: - -- Using directly a `JSON Schema `_ -- Defining a `Pydantic model `_ and then extracting the JSON Schema from it (which is normally an easier option). - -The next example shows how to use the ``guided_json`` parameter with a Pydantic model: - -.. 
code-block:: python - - from pydantic import BaseModel - from enum import Enum - - class CarType(str, Enum): - sedan = "sedan" - suv = "SUV" - truck = "Truck" - coupe = "Coupe" - - - class CarDescription(BaseModel): - brand: str - model: str - car_type: CarType - - - json_schema = CarDescription.model_json_schema() - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", - } - ], - extra_body={"guided_json": json_schema}, - ) - print(completion.choices[0].message.content) - -.. tip:: - While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. - This can improve the results notably in most cases. - - -Finally we have the ``guided_grammar``, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. -It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: - -.. code-block:: python - - simplified_sql_grammar = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ - """ - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", - } - ], - extra_body={"guided_grammar": simplified_sql_grammar}, - ) - print(completion.choices[0].message.content) - -The complete code of the examples can be found on `examples/openai_chat_completion_structured_outputs.py `_. - -Experimental Automatic Parsing (OpenAI API) --------------------------------------------- - -This section covers the OpenAI beta wrapper over the ``client.chat.completions.create()`` method that provides richer integrations with Python specific types. - -At the time of writing (``openai==1.54.4``), this is a "beta" feature in the OpenAI client library. Code reference can be found `here `_. - -For the following examples, vLLM was setup using ``vllm serve meta-llama/Llama-3.1-8B-Instruct`` - -Here is a simple example demonstrating how to get structured output using Pydantic models: - -.. code-block:: python - - from pydantic import BaseModel - from openai import OpenAI - - - class Info(BaseModel): - name: str - age: int - - - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") - completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, - ], - response_format=Info, - extra_body=dict(guided_decoding_backend="outlines"), - ) - - message = completion.choices[0].message - print(message) - assert message.parsed - print("Name:", message.parsed.name) - print("Age:", message.parsed.age) - -Output: - -.. 
code-block:: console - - ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) - Name: Cameron - Age: 28 - - -Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: - -.. code-block:: python - - from typing import List - from pydantic import BaseModel - from openai import OpenAI - - - class Step(BaseModel): - explanation: str - output: str - - - class MathResponse(BaseModel): - steps: List[Step] - final_answer: str - - - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") - completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful expert math tutor."}, - {"role": "user", "content": "Solve 8x + 31 = 2."}, - ], - response_format=MathResponse, - extra_body=dict(guided_decoding_backend="outlines"), - ) - - message = completion.choices[0].message - print(message) - assert message.parsed - for i, step in enumerate(message.parsed.steps): - print(f"Step #{i}:", step) - print("Answer:", message.parsed.final_answer) - -Output: - -.. code-block:: console - - ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) - Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31' - Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29' - Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8' - Answer: x = -29/8 - -Offline Inference ------------------ - -Offline inference allows for the same types of guided decoding. -To use it, we´ll need to configure the guided decoding using the class ``GuidedDecodingParams`` inside ``SamplingParams``. -The main available options inside ``GuidedDecodingParams`` are: - -- ``json`` -- ``regex`` -- ``choice`` -- ``grammar`` -- ``backend`` -- ``whitespace_pattern`` - -These parameters can be used in the same way as the parameters from the Online Inference examples above. -One example for the usage of the ``choices`` parameter is shown below: - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - from vllm.sampling_params import GuidedDecodingParams - - llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") - - guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) - sampling_params = SamplingParams(guided_decoding=guided_decoding_params) - outputs = llm.generate( - prompts="Classify this sentiment: vLLM is wonderful!", - sampling_params=sampling_params, - ) - print(outputs[0].outputs[0].text) - -A complete example with all options can be found in `examples/offline_inference_structured_outputs.py `_. diff --git a/docs/source/usage/usage_stats.md b/docs/source/usage/usage_stats.md index a1e4b1c38acae..3d02fbab9216e 100644 --- a/docs/source/usage/usage_stats.md +++ b/docs/source/usage/usage_stats.md @@ -4,7 +4,7 @@ vLLM collects anonymous usage data by default to help the engineering team bette ## What data is collected? -You can see the up to date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py). +The list of data collected by the latest version of vLLM can be found here: Here is an example as of v0.4.0: @@ -47,7 +47,7 @@ tail ~/.config/vllm/usage_stats.json ## Opt-out of Usage Stats Collection -You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file: +You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: ```bash # Any of the following methods can disable usage stats collection diff --git a/examples/openai_cross_encoder_score.py b/examples/openai_cross_encoder_score.py index a06af8df5d3fe..365a684d53f2b 100644 --- a/examples/openai_cross_encoder_score.py +++ b/examples/openai_cross_encoder_score.py @@ -20,9 +20,9 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") + args = parser.parse_args() api_url = f"http://{args.host}:{args.port}/score" - model_name = args.model text_1 = "What is the capital of Brazil?" diff --git a/examples/openai_pooling_client.py b/examples/openai_pooling_client.py new file mode 100644 index 0000000000000..37ec8f2fb6be3 --- /dev/null +++ b/examples/openai_pooling_client.py @@ -0,0 +1,51 @@ +""" +Example online usage of Pooling API. + +Run `vllm serve --task ` +to start up the server in vLLM. 
+""" +import argparse +import pprint + +import requests + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", + type=str, + default="jason9693/Qwen2.5-1.5B-apeach") + + args = parser.parse_args() + api_url = f"http://{args.host}:{args.port}/pooling" + model_name = args.model + + # Input like Completions API + prompt = {"model": model_name, "input": "vLLM is great!"} + pooling_response = post_http_request(prompt=prompt, api_url=api_url) + print("Pooling Response:") + pprint.pprint(pooling_response.json()) + + # Input like Chat API + prompt = { + "model": + model_name, + "messages": [{ + "role": "user", + "content": [{ + "type": "text", + "text": "vLLM is great!" + }], + }] + } + pooling_response = post_http_request(prompt=prompt, api_url=api_url) + print("Pooling Response:") + pprint.pprint(pooling_response.json()) diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 058ab7c1ee9df..8002fbd8ee5b9 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for NVIDIA GPUs -ray >= 2.9 +ray[default] >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package torch == 2.5.1 # These must be updated alongside torch diff --git a/requirements-openvino.txt b/requirements-openvino.txt index 95e5914757812..ac9d851d661b0 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -4,5 +4,5 @@ torch == 2.5.1 # should be aligned with "common" vLLM torch version openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention -optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version -optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version +optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version +optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version diff --git a/setup.py b/setup.py index fcfaa207c176a..61d2d710aa20e 100644 --- a/setup.py +++ b/setup.py @@ -455,9 +455,13 @@ def get_gaudi_sw_version(): def get_vllm_version() -> str: - version = get_version( - write_to="vllm/_version.py", # TODO: move this to pyproject.toml - ) + # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236 + try: + version = get_version( + write_to="vllm/_version.py", # TODO: move this to pyproject.toml + ) + except LookupError: + version = "0.0.0" sep = "+" if "+" not in version else "." 
# dev versions might contain + @@ -466,7 +470,7 @@ def get_vllm_version() -> str: version += f"{sep}empty" elif _is_cuda(): if envs.VLLM_USE_PRECOMPILED: - version += ".precompiled" + version += f"{sep}precompiled" else: cuda_version = str(get_nvcc_cuda_version()) if cuda_version != MAIN_CUDA_VERSION: @@ -630,6 +634,7 @@ def _read_requirements(filename: str) -> List[str]: ext_modules=ext_modules, extras_require={ "tensorizer": ["tensorizer>=2.9.0"], + "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile"], # Required for audio processing "video": ["decord"] # Required for video processing }, diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 11d05cefb7313..1c2193bb17a55 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -127,11 +127,6 @@ def test_models_distributed( if attention_backend: os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend - # Import VLLM_USE_V1 dynamically to handle patching - from vllm.envs import VLLM_USE_V1 - if VLLM_USE_V1 and distributed_executor_backend != "mp": - pytest.skip(f"Skip {distributed_executor_backend} for V1") - dtype = "half" max_tokens = 5 diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index e9c48f2b6b551..ccb9906fc5c0f 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -174,11 +174,6 @@ def test_guided_choice_completion(sample_guided_choice, llm, @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) def test_guided_grammar(sample_sql_statements, llm, guided_decoding_backend: str): - if guided_decoding_backend == "outlines": - pytest.skip("Outlines backend fails in this test case with:\n" - "AttributeError: Error in model execution: 'ParserConf' " - "object has no attribute 'deterministic'") - sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1000, diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 0a29d77e73abc..1116c0da1a6f0 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded( messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded( model=model_name, messages=messages, max_completion_tokens=10, + temperature=0.0, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 47c521a9b5eb5..5e6499d8f563c 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -17,6 +17,8 @@ # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] + @pytest.fixture(scope="module") def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811 @@ -464,8 +466,7 @@ async 
def test_chat_completion_stream_options(client: openai.AsyncOpenAI, # will fail on the second `guided_decoding_backend` even when I swap their order # (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256) @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_guided_choice): @@ -506,8 +507,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_json_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_json_schema): @@ -554,8 +554,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_regex_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_regex): messages = [{ @@ -613,8 +612,7 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_guided_choice): @@ -646,8 +644,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_named_tool_use(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_json_schema): @@ -681,7 +678,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, "function": { "name": "dummy_function_name" } - }) + }, + extra_body=dict(guided_decoding_backend=guided_decoding_backend)) message = chat_completion.choices[0].message assert len(message.content) == 0 json_string = message.tool_calls[0].function.arguments @@ -716,6 +714,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, "name": "dummy_function_name" } }, + extra_body=dict(guided_decoding_backend=guided_decoding_backend), stream=True) output = [] @@ -738,10 +737,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) -async def test_required_tool_use_not_yet_supported( - client: openai.AsyncOpenAI, guided_decoding_backend: str, - sample_json_schema): +async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI, + sample_json_schema): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -785,9 +782,7 @@ async def test_required_tool_use_not_yet_supported( @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, - guided_decoding_backend: str, sample_json_schema): messages = [{ "role": "system", diff --git a/tests/entrypoints/openai/test_embedding.py 
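
The guided-decoding chat tests above are now parametrized over outlines, lm-format-enforcer and xgrammar, and the tool-use tests pass the chosen backend through extra_body so it applies per request rather than server-wide. A minimal client-side sketch of that mechanism, assuming a vLLM OpenAI-compatible server is already running on localhost:8000 and serving the same zephyr model (both assumptions, not part of this diff):

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_completion = client.chat.completions.create(
    model="HuggingFaceH4/zephyr-7b-beta",
    messages=[{"role": "user", "content": "Name one primary color."}],
    # vLLM-specific fields go through extra_body: guided_choice constrains the
    # output, guided_decoding_backend pins the backend for this request only.
    extra_body={
        "guided_choice": ["red", "yellow", "blue"],
        "guided_decoding_backend": "xgrammar",
    },
)
print(chat_completion.choices[0].message.content)
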
b/tests/entrypoints/openai/test_embedding.py index 9f2b77dde2a7f..b52a5b28c9cff 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -6,6 +6,7 @@ import pytest_asyncio import requests +from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer @@ -17,6 +18,8 @@ @pytest.fixture(scope="module") def server(): args = [ + "--task", + "embed", # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -45,11 +48,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): ] # test single embedding - embeddings = await client.embeddings.create( + embedding_response = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + assert embeddings.id is not None assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 4096 @@ -59,11 +65,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): # test using token IDs input_tokens = [1, 1, 1, 1, 1] - embeddings = await client.embeddings.create( + embedding_response = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + assert embeddings.id is not None assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 4096 @@ -80,11 +89,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." 
] - embeddings = await client.embeddings.create( + embedding_response = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + assert embeddings.id is not None assert len(embeddings.data) == 3 assert len(embeddings.data[0].embedding) == 4096 @@ -95,11 +107,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): # test List[List[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] - embeddings = await client.embeddings.create( + embedding_response = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + assert embeddings.id is not None assert len(embeddings.data) == 4 assert len(embeddings.data[0].embedding) == 4096 @@ -124,14 +139,16 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, "content": "Stars twinkle brightly in the night sky.", }] - chat_response = requests.post(server.url_for("v1/embeddings"), - json={ - "model": model_name, - "messages": messages, - "encoding_format": "float", - }) + chat_response = requests.post( + server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float", + }, + ) chat_response.raise_for_status() - chat_embeddings = chat_response.json() + chat_embeddings = EmbeddingResponse.model_validate(chat_response.json()) tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") prompt = tokenizer.apply_chat_template( @@ -148,13 +165,15 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, # To be consistent with chat extra_body={"add_special_tokens": False}, ) - completion_embeddings = completion_response.model_dump(mode="json") + completion_embeddings = EmbeddingResponse.model_validate( + completion_response.model_dump(mode="json")) - assert chat_embeddings.pop("id") is not None - assert completion_embeddings.pop("id") is not None - assert chat_embeddings.pop("created") <= completion_embeddings.pop( - "created") - assert chat_embeddings == completion_embeddings + assert chat_embeddings.id is not None + assert completion_embeddings.id is not None + assert chat_embeddings.created <= completion_embeddings.created + assert chat_embeddings.model_dump( + exclude={"id", "created"}) == (completion_embeddings.model_dump( + exclude={"id", "created"})) @pytest.mark.asyncio @@ -204,10 +223,13 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, ] # test single embedding - embeddings = await client.embeddings.create( + embedding_response = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + assert embeddings.id is not None assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 4096 @@ -219,10 +241,12 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] - embeddings = await client.embeddings.create( + embedding_response = await client.embeddings.create( model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) + embeddings = EmbeddingResponse.model_validate( + 
embedding_response.model_dump(mode="json")) assert embeddings.id is not None assert len(embeddings.data) == 1 @@ -241,10 +265,10 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, ] with pytest.raises(openai.BadRequestError): - embeddings = await client.embeddings.create( + response = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 8193}) - assert "error" in embeddings.object + assert "error" in response.object assert "truncate_prompt_tokens value is greater than max_model_len. "\ - "Please, select a smaller truncation size." in embeddings.message + "Please, select a smaller truncation size." in response.message diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py new file mode 100644 index 0000000000000..9c49239398cd2 --- /dev/null +++ b/tests/entrypoints/openai/test_pooling.py @@ -0,0 +1,238 @@ +import base64 + +import numpy as np +import pytest +import requests + +from vllm.entrypoints.openai.protocol import PoolingResponse +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "classify", + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--chat-template", + DUMMY_CHAT_TEMPLATE, + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_pooling(server: RemoteOpenAIServer, model_name: str): + input_texts = [ + "The chef prepared a delicious meal.", + ] + + # test single pooling + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "float" + }, + ) + response.raise_for_status() + poolings = PoolingResponse.model_validate(response.json()) + + assert poolings.id is not None + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 2 + assert poolings.usage.completion_tokens == 0 + assert poolings.usage.prompt_tokens == 7 + assert poolings.usage.total_tokens == 7 + + # test using token IDs + input_tokens = [1, 1, 1, 1, 1] + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_tokens, + "encoding_format": "float" + }, + ) + response.raise_for_status() + poolings = PoolingResponse.model_validate(response.json()) + + assert poolings.id is not None + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 2 + assert poolings.usage.completion_tokens == 0 + assert poolings.usage.prompt_tokens == 5 + assert poolings.usage.total_tokens == 5 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): + # test List[str] + input_texts = [ + "The cat sat on the mat.", "A feline was resting on a rug.", + "Stars twinkle brightly in the night sky." 
+ ] + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "float" + }, + ) + response.raise_for_status() + poolings = PoolingResponse.model_validate(response.json()) + + assert poolings.id is not None + assert len(poolings.data) == 3 + assert len(poolings.data[0].data) == 2 + assert poolings.usage.completion_tokens == 0 + assert poolings.usage.prompt_tokens == 25 + assert poolings.usage.total_tokens == 25 + + # test List[List[int]] + input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], + [25, 32, 64, 77]] + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_tokens, + "encoding_format": "float" + }, + ) + response.raise_for_status() + poolings = PoolingResponse.model_validate(response.json()) + + assert poolings.id is not None + assert len(poolings.data) == 4 + assert len(poolings.data[0].data) == 2 + assert poolings.usage.completion_tokens == 0 + assert poolings.usage.prompt_tokens == 17 + assert poolings.usage.total_tokens == 17 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_conversation_pooling(server: RemoteOpenAIServer, + model_name: str): + messages = [{ + "role": "user", + "content": "The cat sat on the mat.", + }, { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }] + + chat_response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float", + }, + ) + chat_response.raise_for_status() + chat_poolings = PoolingResponse.model_validate(chat_response.json()) + + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + prompt = tokenizer.apply_chat_template( + messages, + chat_template=DUMMY_CHAT_TEMPLATE, + add_generation_prompt=True, + continue_final_message=False, + tokenize=False, + ) + completions_response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": prompt, + "encoding_format": "float", + # To be consistent with chat + "add_special_tokens": False, + }, + ) + completions_response.raise_for_status() + completion_poolings = PoolingResponse.model_validate( + completions_response.json()) + + assert chat_poolings.id is not None + assert completion_poolings.id is not None + assert chat_poolings.created <= completion_poolings.created + assert chat_poolings.model_dump( + exclude={"id", "created"}) == (completion_poolings.model_dump( + exclude={"id", "created"})) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_base64_pooling(server: RemoteOpenAIServer, + model_name: str): + input_texts = [ + "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ] + + float_response = requests.post( + server.url_for("pooling"), + json={ + "input": input_texts, + "model": model_name, + "encoding_format": "float", + }, + ) + float_response.raise_for_status() + responses_float = PoolingResponse.model_validate(float_response.json()) + + base64_response = requests.post( + server.url_for("pooling"), + json={ + "input": input_texts, + "model": model_name, + "encoding_format": "base64", + }, + ) + base64_response.raise_for_status() + responses_base64 = PoolingResponse.model_validate(base64_response.json()) + + decoded_responses_base64_data = [] + for data in responses_base64.data: + 
decoded_responses_base64_data.append( + np.frombuffer(base64.b64decode(data.data), + dtype="float32").tolist()) + + assert responses_float.data[0].data == decoded_responses_base64_data[0] + assert responses_float.data[1].data == decoded_responses_base64_data[1] + + # Default response is float32 decoded from base64 by OpenAI Client + default_response = requests.post( + server.url_for("pooling"), + json={ + "input": input_texts, + "model": model_name, + }, + ) + default_response.raise_for_status() + responses_default = PoolingResponse.model_validate(default_response.json()) + + assert responses_float.data[0].data == responses_default.data[0].data + assert responses_float.data[1].data == responses_default.data[1].data diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 294b250362699..e73449e406739 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded( messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded( model=model_name, messages=messages, max_completion_tokens=10, + temperature=0.0, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index a0b6edd566561..5f070ba3b12e9 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded( messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded( model=model_name, messages=messages, max_completion_tokens=10, + temperature=0.0, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 43c63daacb17f..3731b2dcdeae1 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,9 +1,9 @@ from typing import Dict import pytest -import pytest_asyncio import requests +from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.multimodal.utils import encode_image_base64, fetch_image from ...utils import VLLM_PATH, RemoteOpenAIServer @@ -46,12 +46,6 @@ def server(): yield remote_server -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - @pytest.fixture(scope="session") def base64_encoded_image() -> Dict[str, str]: return { @@ -82,18 +76,20 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, ], }] - response = 
requests.post(server.url_for("v1/embeddings"), - json={ - "model": model_name, - "messages": messages, - "encoding_format": "float" - }) + response = requests.post( + server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }, + ) response.raise_for_status() - - embeddings = response.json() - assert embeddings["id"] is not None - assert len(embeddings["data"]) == 1 - assert len(embeddings["data"][0]["embedding"]) == 3072 - assert embeddings["usage"]["completion_tokens"] == 0 - assert embeddings["usage"]["prompt_tokens"] == 765 - assert embeddings["usage"]["total_tokens"] == 765 + embeddings = EmbeddingResponse.model_validate(response.json()) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 3072 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 765 + assert embeddings.usage.total_tokens == 765 diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 96b0e58713332..718730bb8cbbe 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("Test run passed!") + print("My rank: %d, Test run passed!" % (my_rank)) def stress_test(my_rank, buf, device): @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("Passed stress test!") + print("My rank: %d, Passed stress test!" % (my_rank)) if __name__ == "__main__": diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh index 09d7ee018c3f4..f2aeaee9ca6d5 100644 --- a/tests/kv_transfer/test_lookup_buffer.sh +++ b/tests/kv_transfer/test_lookup_buffer.sh @@ -1,3 +1,8 @@ #!/bin/bash -RANK=0 python test_lookup_buffer.py & -RANK=1 python test_lookup_buffer.py & \ No newline at end of file +RANK=0 python3 test_lookup_buffer.py & +PID0=$! +RANK=1 python3 test_lookup_buffer.py & +PID1=$! 
+ +wait $PID0 +wait $PID1 diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 65973bf10a4d7..4beba4dc05dde 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -10,39 +10,42 @@ def test_run(my_rank, pipe): + print(f"rank {my_rank} test_run starts....") # test run x = torch.tensor([1]).to(pipe.device) y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) if my_rank == 0: pipe.send_tensor(x) - print("sent tensor x") + print(f"rank {my_rank} sent tensor x") pipe.send_tensor(y) - print("sent tensor y") + print(f"rank {my_rank} sent tensor y") x2 = pipe.recv_tensor() - print("received x2 = ", x2) + print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print("received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", x2) else: x2 = pipe.recv_tensor() - print("received x2 = ", x2) + print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print("received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", x2) pipe.send_tensor(x) - print("sent tensor x") + print(f"rank {my_rank} sent tensor x") pipe.send_tensor(y) - print("sent tensor y") + print(f"rank {my_rank} sent tensor y") assert torch.allclose(x, x2) assert torch.allclose(y, y2) + print(f"rank {my_rank} test_run passed!") -def stress_test(my_rank, pipe): - torch.distributed.barrier() +def stress_test(my_rank, pipe): + print(f"rank {my_rank} stress_test starts....") tensors: List[torch.Tensor] = [] + torch.distributed.barrier() torch.manual_seed(0) for i in tqdm(range(500)): @@ -86,7 +89,6 @@ def stress_test(my_rank, pipe): def latency_test(my_rank, pipe, nelement, ntensor): - latencies = [] torch.distributed.barrier() @@ -149,6 +151,7 @@ def latency_test(my_rank, pipe, nelement, ntensor): ) test_run(my_rank, pipe) + stress_test(my_rank, pipe) # Use this function if you want to test the latency of pipe impl. diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh index 1e89e246b4992..54e0604806841 100644 --- a/tests/kv_transfer/test_send_recv.sh +++ b/tests/kv_transfer/test_send_recv.sh @@ -1,3 +1,9 @@ #!/bin/bash + RANK=0 python3 test_send_recv.py & -RANK=1 python3 test_send_recv.py & \ No newline at end of file +PID0=$! +RANK=1 python3 test_send_recv.py & +PID1=$! 
+ +wait $PID0 +wait $PID1 diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 29ecf37808205..8b247fb9b2388 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -200,6 +200,11 @@ def minicpmv_lora_files(): return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon") +@pytest.fixture(scope="session") +def qwen2vl_lora_files(): + return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon") + + @pytest.fixture(scope="session") def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 9a529e27b4cd8..537d95b025a9d 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -4,6 +4,7 @@ from vllm.lora.models import LoRAModel from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM +from vllm.model_executor.models.utils import WeightsMapper lora_lst = [ "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b" @@ -71,3 +72,37 @@ def test_load_checkpoints( device="cpu", embedding_modules=embedding_modules, embedding_padding_modules=embed_padding_modules) + + +def test_lora_weights_mapping(baichuan_lora_files): + supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules + packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping + embedding_modules = BaiChuanBaseForCausalLM.embedding_modules + embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules + expected_lora_modules: List[str] = [] + for module in supported_lora_modules: + if module in packed_modules_mapping: + expected_lora_modules.extend(packed_modules_mapping[module]) + else: + expected_lora_modules.append(module) + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.": "language_model.model.", + }, + orig_to_new_substr={ + ".layers.": ".baichuan_layers.", + }, + ) + lora_model = LoRAModel.from_local_checkpoint( + baichuan_lora_files, + expected_lora_modules, + lora_model_id=1, + device="cpu", + embedding_modules=embedding_modules, + embedding_padding_modules=embed_padding_modules, + weights_mapper=hf_to_vllm_mapper, + ) + for name in lora_model.loras: + assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."]) + assert ".baichuan_layers." 
in name diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 150221dfce6ab..797a495201d33 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): @pytest.mark.parametrize("tp_size", [4]) +@pytest.mark.parametrize("fully_shard", [True, False]) def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules, - tp_size): + tp_size, fully_shard): """This LoRA model has all supported Mixtral target modules""" if torch.cuda.device_count() < tp_size: @@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules, max_loras=4, distributed_executor_backend="ray", tensor_parallel_size=tp_size, + fully_sharded_loras=fully_shard, max_lora_rank=32, ) diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py new file mode 100644 index 0000000000000..c9f48402b0268 --- /dev/null +++ b/tests/lora/test_qwen2vl.py @@ -0,0 +1,82 @@ +from typing import List + +import pytest + +import vllm +from vllm.assets.image import ImageAsset +from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform + +MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct" + +PROMPT_TEMPLATE = ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>" + "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" + "What is in the image?<|im_end|>\n" + "<|im_start|>assistant\n") + +IMAGE_ASSETS = [ + ImageAsset("stop_sign"), + ImageAsset("cherry_blossom"), +] + +# After fine-tuning with LoRA, all generated content should start begin `A`. +EXPECTED_OUTPUT = [ + "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501 + "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501 +] + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + sampling_params = vllm.SamplingParams( + temperature=0, + max_tokens=5, + ) + + inputs = [{ + "prompt": PROMPT_TEMPLATE, + "multi_modal_data": { + "image": asset.pil_image + }, + } for asset in IMAGE_ASSETS] + + outputs = llm.generate( + inputs, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None, + ) + # Print the outputs. 
+ generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +@pytest.mark.xfail(current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm" + ) +def test_qwen2vl_lora(qwen2vl_lora_files): + llm = vllm.LLM( + MODEL_PATH, + max_num_seqs=2, + enable_lora=True, + max_loras=2, + max_lora_rank=16, + trust_remote_code=True, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, + max_model_len=4096, + ) + output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output1[i]) + + output2 = do_sample(llm, qwen2vl_lora_files, lora_id=2) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output2[i]) diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index 6e6e5b40d6a35..18ceb34a4e042 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -3,7 +3,7 @@ import pytest import torch -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ...utils import check_logprobs_close diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py index 45a7365204403..7406df253e7f0 100644 --- a/tests/models/decoder_only/vision_language/test_h2ovl.py +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -8,7 +8,7 @@ # Import the functions to test from vllm.model_executor.models.h2ovl import (calculate_num_blocks, image_to_pixel_values_wrapper) -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size models = [ "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index 82eae0705c9ba..3a8934adfb076 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -5,7 +5,7 @@ import pytest from transformers import AutoTokenizer -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 71b6ba4dca435..51fe7d2ad32a8 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -6,8 +6,8 @@ from PIL import Image from vllm.entrypoints.llm import LLM -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import rescale_video_size, sample_frames_from_video from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py 
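
The import rewrites in the vision-language tests above (and in the vlm_utils helpers that follow) track the split of vllm.multimodal.utils into dedicated image and video modules. A minimal sketch of a call site under the new layout, with the file path purely illustrative:

from PIL import Image

from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video

image = Image.open("cherry_blossom.jpg")    # illustrative path, not from this diff
half_size = rescale_image_size(image, 0.5)  # same helper, new module

# The video helpers moved the same way; `frames` is assumed to be a decoded
# ndarray of video frames as used elsewhere in these tests.
# sampled = sample_frames_from_video(frames, num_frames=16)
# resized = rescale_video_size(sampled, 0.5)
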
b/tests/models/decoder_only/vision_language/vlm_utils/builders.py index 66668296139f5..59773be709fa8 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py @@ -5,8 +5,9 @@ import torch -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - resize_video, sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import (rescale_video_size, resize_video, + sample_frames_from_video) from .....conftest import _ImageAssets, _VideoAssets from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py index e698d8d3f6f56..2291f4fa0d0ac 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py @@ -1,8 +1,9 @@ """Custom input builders for edge-cases in different models.""" from typing import Callable -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - resize_video, sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import (rescale_video_size, resize_video, + sample_frames_from_video) from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .builders import build_multi_image_inputs, build_single_image_inputs diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6321503e7b248..6673a9fc22f69 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -1,7 +1,4 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -This test only tests small models. Big models such as 7B should be tested from -test_big_models.py because it could use a larger instance to run tests. +"""Compare the classification outputs of HF and vLLM models. Run `pytest tests/models/test_cls_models.py`. """ diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index af31e1a635f65..be6e3842821e2 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -1,6 +1,6 @@ -"""Compare the embedding outputs of HF and vLLM models. +"""Compare the scoring outputs of HF and vLLM models. -Run `pytest tests/models/embedding/language/test_embedding.py`. +Run `pytest tests/models/embedding/language/test_scoring.py`. 
""" import math diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 77dd1d81f84d7..636a3eedff31b 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -6,7 +6,7 @@ from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index b5368aab3ecf1..73b70d65e8e0b 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -6,7 +6,9 @@ from vllm.model_executor.models import (is_pooling_model, is_text_generation_model, supports_multimodal) -from vllm.model_executor.models.adapters import as_embedding_model +from vllm.model_executor.models.adapters import (as_classification_model, + as_embedding_model, + as_reward_model) from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS, _SPECULATIVE_DECODING_MODELS, _TEXT_GENERATION_MODELS, @@ -29,9 +31,10 @@ def test_registry_imports(model_arch): or model_arch in _MULTIMODAL_MODELS): assert is_text_generation_model(model_cls) - # All vLLM models should be convertible to an embedding model - embed_model = as_embedding_model(model_cls) - assert is_pooling_model(embed_model) + # All vLLM models should be convertible to a pooling model + assert is_pooling_model(as_classification_model(model_cls)) + assert is_pooling_model(as_embedding_model(model_cls)) + assert is_pooling_model(as_reward_model(model_cls)) if model_arch in _MULTIMODAL_MODELS: assert supports_multimodal(model_cls) diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 71832acbd17b8..81f2a06182bcc 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -6,7 +6,7 @@ from vllm.config import ModelConfig from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size @pytest.fixture diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index d676eacffb056..5e7d7d1877e61 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -13,6 +13,7 @@ class MyGemma2Embedding(nn.Module): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -62,8 +63,8 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - weights = hf_to_vllm_mapper.apply(weights) + + weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) return self.model.load_weights(weights) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 38e02f6018dee..92436889ecffe 100644 --- 
a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -79,12 +79,12 @@ def zp_valid(zp: Optional[torch.Tensor]): assert output -@pytest.mark.parametrize( - "model_path", - [ - "neuralmagic/Llama-3.2-1B-quantized.w8a8" - # TODO static & asymmetric - ]) +@pytest.mark.parametrize("model_path", [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" +]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [10]) def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner, @@ -92,6 +92,10 @@ def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner, max_tokens, num_logprobs): dtype = "bfloat16" + # skip language translation prompt for the static per tensor asym model + if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym": # noqa: E501 + example_prompts = example_prompts[0:-1] + with hf_runner(model_path, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/tests/runai_model_streamer/__init__.py b/tests/runai_model_streamer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/runai_model_streamer/test_runai_model_streamer_loader.py new file mode 100644 index 0000000000000..c5722fbae5c8a --- /dev/null +++ b/tests/runai_model_streamer/test_runai_model_streamer_loader.py @@ -0,0 +1,31 @@ +from vllm import SamplingParams +from vllm.config import LoadConfig, LoadFormat +from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader, + get_model_loader) + +test_model = "openai-community/gpt2" + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. 
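
The new tests/runai_model_streamer files introduced here exercise the RUNAI_STREAMER load format end to end via vllm_runner. A minimal sketch of the equivalent direct usage, assuming the optional dependency has been installed with `pip install vllm[runai]` (the extra added to setup.py earlier in this diff) and reusing the small model from these tests:

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat

# Stream the safetensors weights with the Run:ai model streamer instead of the
# default loader.
llm = LLM(model="openai-community/gpt2",
          load_format=LoadFormat.RUNAI_STREAMER)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, top_p=0.95))
print(outputs[0].outputs[0].text)
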
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0) + + +def get_runai_model_loader(): + load_config = LoadConfig(load_format=LoadFormat.RUNAI_STREAMER) + return get_model_loader(load_config) + + +def test_get_model_loader_with_runai_flag(): + model_loader = get_runai_model_loader() + assert isinstance(model_loader, RunaiModelStreamerLoader) + + +def test_runai_model_loader_download_files(vllm_runner): + with vllm_runner(test_model, load_format=LoadFormat.RUNAI_STREAMER) as llm: + deserialized_outputs = llm.generate(prompts, sampling_params) + assert deserialized_outputs diff --git a/tests/runai_model_streamer/test_weight_utils.py b/tests/runai_model_streamer/test_weight_utils.py new file mode 100644 index 0000000000000..5c89bd78ad81d --- /dev/null +++ b/tests/runai_model_streamer/test_weight_utils.py @@ -0,0 +1,39 @@ +import glob +import tempfile + +import huggingface_hub.constants +import torch + +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf, runai_safetensors_weights_iterator, + safetensors_weights_iterator) + + +def test_runai_model_loader(): + with tempfile.TemporaryDirectory() as tmpdir: + huggingface_hub.constants.HF_HUB_OFFLINE = False + download_weights_from_hf("openai-community/gpt2", + allow_patterns=["*.safetensors"], + cache_dir=tmpdir) + safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True) + assert len(safetensors) > 0 + + runai_model_streamer_tensors = {} + hf_safetensors_tensors = {} + + for name, tensor in runai_safetensors_weights_iterator(safetensors): + runai_model_streamer_tensors[name] = tensor + + for name, tensor in safetensors_weights_iterator(safetensors): + hf_safetensors_tensors[name] = tensor + + assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors) + + for name, runai_tensor in runai_model_streamer_tensors.items(): + assert runai_tensor.dtype == hf_safetensors_tensors[name].dtype + assert runai_tensor.shape == hf_safetensors_tensors[name].shape + assert torch.all(runai_tensor.eq(hf_safetensors_tensors[name])) + + +if __name__ == "__main__": + test_runai_model_loader() diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 19f31b8ec419d..aeacf5dda5761 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,5 +1,4 @@ import contextlib -import functools import importlib from typing import TYPE_CHECKING, List, Optional, Tuple, Union @@ -36,34 +35,6 @@ def register_fake(fn): from torch.library import impl_abstract as register_fake -def hint_on_error(fn): - - @functools.wraps(fn) - def wrapper(*args, **kwargs): - try: - return fn(*args, **kwargs) - - except NotImplementedError as e: - msg = ( - "Error in calling custom op %s: %s\n" - "Not implemented or built, mostly likely because the current current device " - "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set " - "incorrectly while building)") - logger.error(msg, fn.__name__, e) - raise NotImplementedError(msg % (fn.__name__, e)) from e - except AttributeError as e: - msg = ( - "Error in calling custom op %s: %s\n" - "Possibly you have built or installed an obsolete version of vllm.\n" - "Please try a clean build and install of vllm," - "or remove old built files such as vllm/*cpython*.so and build/ ." 
- ) - logger.error(msg, fn.__name__, e) - raise e - - return wrapper - - # activation ops def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: torch.ops._C.silu_and_mul(out, x) @@ -1101,25 +1072,3 @@ def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]: def register_graph_buffers(fa: int, handles: List[List[int]], offsets: List[List[int]]) -> None: torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) - - -# temporary fix for https://github.com/vllm-project/vllm/issues/5456 -# TODO: remove this in v0.6.0 -names_and_values = globals() -names_and_values_to_update = {} -# prepare variables to avoid dict size change during iteration -k, v, arg = None, None, None -fn_type = type(lambda x: x) -for k, v in names_and_values.items(): - # find functions that are defined in this file and have torch.Tensor - # in their annotations. `arg == "torch.Tensor"` is used to handle - # the case when users use `import __annotations__` to turn type - # hints into strings. - if isinstance(v, fn_type) \ - and v.__code__.co_filename == __file__ \ - and any(arg is torch.Tensor or arg == "torch.Tensor" - for arg in v.__annotations__.values()): - names_and_values_to_update[k] = hint_on_error(v) - -names_and_values.update(names_and_values_to_update) -del names_and_values_to_update, names_and_values, v, k, fn_type diff --git a/vllm/assets/video.py b/vllm/assets/video.py index e4dcab10466db..e6779935bad17 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -7,7 +7,7 @@ from huggingface_hub import hf_hub_download from PIL import Image -from vllm.multimodal.utils import (sample_frames_from_video, +from vllm.multimodal.video import (sample_frames_from_video, try_import_video_packages) from .base import get_cache_dir diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 19daeb729ee61..480901f71047f 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -430,7 +430,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 05d997279893b..69b6d1e4648df 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -191,6 +191,7 @@ def __init__( kv_cache_dtype=None, block_size=16, is_attention_free=False) + attn_backend = backend_name_to_enum(attn_backend.get_name()) if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: attn_backend = _Backend.XFORMERS diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 0c7bbfe599b02..826d1744d88a5 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -141,14 +141,14 @@ def produce_guards_expression(self, *args, **kwargs): return "" -def wrap_inductor(graph, +def wrap_inductor(graph: fx.GraphModule, example_inputs, additional_inductor_config, compilation_config: CompilationConfig, graph_index: int = 0, num_graphs: int = 1, runtime_shape: Optional[int] = None, - use_inductor: bool = True): + use_inductor: bool = True) -> Any: if graph_index == 0: # before compiling the first graph, record the start time global compilation_start_time @@ -209,7 +209,7 @@ def wrap_inductor(graph, returns_tuple = 
graph_returns_tuple(graph) # this is the graph we return to Dynamo to run - def compiled_graph(*args): + def compiled_graph(*args) -> Optional[fx.CompiledFxGraph]: # convert args to list list_args = list(args) graph_output = inductor_compiled_graph(list_args) @@ -247,7 +247,7 @@ def _check_can_cache(*args, **kwargs): # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa return - def _get_shape_env(): + def _get_shape_env() -> AlwaysHitShapeEnv: return AlwaysHitShapeEnv() with patch(# for hijacking the hash of the compiled graph @@ -537,7 +537,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: example_inputs[x].clone() for x in self.sym_tensor_indices ] - def copy_and_call(*args): + def copy_and_call(*args) -> fx.GraphModule: list_args = list(args) for i, index in enumerate(self.sym_tensor_indices): runtime_tensor = list_args[index] diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index 0ad648abfbb3a..b6bcecdc89e26 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -7,6 +7,7 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor import pattern_matcher as pm from torch._ops import OpOverload +from torch.fx import Node from vllm.compilation.fx_utils import find_auto_fn @@ -97,7 +98,7 @@ def insert_getitems(self, tuple_node: fx.Node, self.graph.call_function(operator.getitem, (tuple_node, idx)) for idx in indices) - def insert_auto_fn(self, op: OpOverload, kwargs): + def insert_auto_fn(self, op: OpOverload, kwargs) -> Node: """ Insert an auto_functionalized node with the given op and kwargs. """ diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index fb522ae053e97..34f5f355798b2 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Any, Dict, List from torch import fx as fx @@ -53,7 +53,7 @@ def add(self, pass_: InductorPass): assert isinstance(pass_, InductorPass) self.passes.append(pass_) - def __getstate__(self): + def __getstate__(self) -> Dict[str, List[Any]]: """ Custom pickling for the pass manager, as some passes cannot be pickled. Pickling occurs because the pass manager is set as the value of diff --git a/vllm/config.py b/vllm/config.py index 6badae24d9d7d..17602bda15c69 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -29,6 +29,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, try_get_generation_config, uses_mrope) +from vllm.transformers_utils.utils import is_s3 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, get_cpu_memory, print_warning_once, random_uuid, resolve_obj_by_qualname) @@ -256,6 +257,8 @@ def __init__(self, f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) + self.maybe_pull_model_tokenizer_for_s3(model, tokenizer) + # The tokenizer version is consistent with the model version by default. if tokenizer_revision is None: self.tokenizer_revision = revision @@ -357,6 +360,39 @@ def __init__(self, self._verify_cuda_graph() self._verify_bnb_config() + def maybe_pull_model_tokenizer_for_s3(self, model: str, + tokenizer: str) -> None: + """ + Pull the model config or tokenizer to a temporary + directory in case of S3. + + Args: + model: The model name or path. 
+ tokenizer: The tokenizer name or path. + + """ + if is_s3(model) or is_s3(tokenizer): + try: + from vllm.transformers_utils.s3_utils import S3Model + except ImportError as err: + raise ImportError( + "Please install Run:ai optional dependency " + "to use the S3 capabilities. " + "You can install it with: pip install vllm[runai]" + ) from err + + if is_s3(model): + self.s3_model = S3Model() + self.s3_model.pull_files(model, allow_pattern=["*config.json"]) + self.model_weights = self.model + self.model = self.s3_model.dir + + if is_s3(tokenizer): + self.s3_tokenizer = S3Model() + self.s3_tokenizer.pull_files( + model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) + self.tokenizer = self.s3_tokenizer.dir + def _init_multimodal_config( self, limit_mm_per_prompt: Optional[Mapping[str, int]] ) -> Optional["MultiModalConfig"]: @@ -602,7 +638,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if not current_platform.is_async_output_supported(self.enforce_eager): logger.warning( @@ -622,7 +658,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.runner_type == "pooling": self.use_async_output_proc = False - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" @@ -1099,6 +1135,7 @@ class LoadFormat(str, enum.Enum): GGUF = "gguf" BITSANDBYTES = "bitsandbytes" MISTRAL = "mistral" + RUNAI_STREAMER = "runai_streamer" @dataclass @@ -2021,7 +2058,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: logger.warning("LoRA with chunked prefill is still experimental " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 912a8b2f54adb..21966d003c7ef 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -316,6 +316,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '* "tensorizer" will load the weights using tensorizer from ' 'CoreWeave. See the Tensorize vLLM Model script in the Examples ' 'section for more information.\n' + '* "runai_streamer" will load the Safetensors weights using Run:ai' + 'Model Streamer \n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') parser.add_argument( @@ -371,7 +373,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: choices=['outlines', 'lm-format-enforcer', 'xgrammar'], help='Which engine will be used for guided decoding' ' (JSON schema / regex etc) by default. Currently support ' - 'https://github.com/outlines-dev/outlines,' + 'https://github.com/outlines-dev/outlines, ' 'https://github.com/mlc-ai/xgrammar, and ' 'https://github.com/noamgat/lm-format-enforcer.' 
' Can be overridden per request via guided_decoding_backend' @@ -1146,7 +1148,7 @@ def create_engine_config(self, disable_logprobs=self.disable_logprobs_during_spec_decoding, ) - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index f50e20cf70323..66a5089074ff5 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1256,3 +1256,10 @@ async def stop_profile(self) -> None: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") + + +# TODO(v1): Remove this class proxy when V1 goes default. +if envs.VLLM_USE_V1: + from vllm.v1.engine.async_llm import AsyncLLM + + AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e78b6f4d26758..39f59e55da1f7 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -133,7 +133,7 @@ class LLMEngine: and the :class:`AsyncLLMEngine` class wraps this class for online serving. The config arguments are derived from :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + :ref:`engine-args`) Args: model_config: The configuration related to the LLM model. diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index a9b638ed02a1e..1c6f735f39e04 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. " diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 94d4a4d89adc9..fadf297e9f6aa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -115,7 +115,7 @@ class LLM: integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + :ref:`engine-args`) Note: This class is intended to be used for offline inference. 
For online @@ -233,7 +233,8 @@ def __init__( self.request_counter = Counter() def __del__(self): - if self.llm_engine and hasattr(self.llm_engine, "shutdown"): + if hasattr(self, 'llm_engine') and self.llm_engine and hasattr( + self.llm_engine, "shutdown"): self.llm_engine.shutdown() @staticmethod diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 00e2d1a56f160..3e50613a73dd3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -27,6 +27,7 @@ import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient @@ -44,8 +45,11 @@ DetokenizeRequest, DetokenizeResponse, EmbeddingRequest, - EmbeddingResponse, ErrorResponse, + EmbeddingResponse, + EmbeddingResponseData, + ErrorResponse, LoadLoraAdapterRequest, + PoolingRequest, PoolingResponse, ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, @@ -55,6 +59,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) @@ -66,11 +71,6 @@ is_valid_ipv6_address) from vllm.version import __version__ as VLLM_VERSION -if envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLMEngine # type: ignore -else: - from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore - TIMEOUT_KEEP_ALIVE = 5 # seconds prometheus_multiproc_dir: tempfile.TemporaryDirectory @@ -288,6 +288,10 @@ def completion(request: Request) -> Optional[OpenAIServingCompletion]: return request.app.state.openai_serving_completion +def pooling(request: Request) -> Optional[OpenAIServingPooling]: + return request.app.state.openai_serving_pooling + + def embedding(request: Request) -> Optional[OpenAIServingEmbedding]: return request.app.state.openai_serving_embedding @@ -399,10 +403,36 @@ async def create_completion(request: CompletionRequest, raw_request: Request): async def create_embedding(request: EmbeddingRequest, raw_request: Request): handler = embedding(raw_request) if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") + fallback_handler = pooling(raw_request) + if fallback_handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") + + logger.warning( + "Embeddings API will become exclusive to embedding models " + "in a future release. 
To return the hidden states directly, " + "use the Pooling API (`/pooling`) instead.") + + res = await fallback_handler.create_pooling(request, raw_request) + if isinstance(res, PoolingResponse): + generator = EmbeddingResponse( + id=res.id, + object=res.object, + created=res.created, + model=res.model, + data=[ + EmbeddingResponseData( + index=d.index, + embedding=d.data, # type: ignore + ) for d in res.data + ], + usage=res.usage, + ) + else: + generator = res + else: + generator = await handler.create_embedding(request, raw_request) - generator = await handler.create_embedding(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -412,6 +442,24 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): assert_never(generator) +@router.post("/pooling") +@with_cancellation +async def create_pooling(request: PoolingRequest, raw_request: Request): + handler = pooling(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Pooling API") + + generator = await handler.create_pooling(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, PoolingResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + @router.post("/score") @with_cancellation async def create_score(request: ScoreRequest, raw_request: Request): @@ -609,7 +657,7 @@ def init_app_state( request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) if model_config.runner_type == "generate" else None - state.openai_serving_embedding = OpenAIServingEmbedding( + state.openai_serving_pooling = OpenAIServingPooling( engine_client, model_config, base_model_paths, @@ -617,13 +665,20 @@ def init_app_state( chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) if model_config.runner_type == "pooling" else None + state.openai_serving_embedding = OpenAIServingEmbedding( + engine_client, + model_config, + base_model_paths, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + ) if model_config.task == "embed" else None state.openai_serving_scores = OpenAIServingScores( engine_client, model_config, base_model_paths, request_logger=request_logger - ) if (model_config.runner_type == "pooling" \ - and model_config.is_cross_encoder) else None + ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 1314de714215e..14e41346df775 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -46,7 +46,15 @@ class OpenAIBaseModel(BaseModel): @classmethod def __log_extra_fields__(cls, data): if isinstance(data, dict): - extra_fields = data.keys() - cls.model_fields.keys() + # Get all class field names and their potential aliases + field_names = set() + for field_name, field in cls.model_fields.items(): + field_names.add(field_name) + if hasattr(field, 'alias') and field.alias: + field_names.add(field.alias) + + # Compare against both field names and aliases + extra_fields = data.keys() - field_names if extra_fields: logger.warning( "The following fields 
were present in the request " @@ -955,6 +963,10 @@ def to_pooling_params(self): EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] +PoolingCompletionRequest = EmbeddingCompletionRequest +PoolingChatRequest = EmbeddingChatRequest +PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest] + class ScoreRequest(OpenAIBaseModel): model: str @@ -1050,6 +1062,21 @@ class EmbeddingResponse(OpenAIBaseModel): usage: UsageInfo +class PoolingResponseData(OpenAIBaseModel): + index: int + object: str = "pooling" + data: Union[List[List[float]], List[float], str] + + +class PoolingResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"pool-{random_uuid()}") + object: str = "list" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + data: List[PoolingResponseData] + usage: UsageInfo + + class ScoreResponseData(OpenAIBaseModel): index: int object: str = "score" diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 675daf54c0d0d..572ed27b39083 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -232,7 +232,7 @@ async def main(args): request_logger=request_logger, chat_template=None, chat_template_content_format="auto", - ) if model_config.runner_type == "pooling" else None + ) if model_config.task == "embed" else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 879276646d2ba..b8fb9d6bd77f2 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -40,36 +40,6 @@ def _get_embedding( assert_never(encoding_format) -def request_output_to_embedding_response( - final_res_batch: List[PoolingRequestOutput], request_id: str, - created_time: int, model_name: str, - encoding_format: Literal["float", "base64"]) -> EmbeddingResponse: - data: List[EmbeddingResponseData] = [] - num_prompt_tokens = 0 - for idx, final_res in enumerate(final_res_batch): - embedding_res = EmbeddingRequestOutput.from_base(final_res) - prompt_token_ids = final_res.prompt_token_ids - - embedding = _get_embedding(embedding_res.outputs, encoding_format) - embedding_data = EmbeddingResponseData(index=idx, embedding=embedding) - data.append(embedding_data) - - num_prompt_tokens += len(prompt_token_ids) - - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - total_tokens=num_prompt_tokens, - ) - - return EmbeddingResponse( - id=request_id, - created=created_time, - model=model_name, - data=data, - usage=usage, - ) - - class OpenAIServingEmbedding(OpenAIServing): def __init__( @@ -114,7 +84,7 @@ async def create_embedding( model_name = request.model request_id = f"embd-{self._base_request_id(raw_request)}" - created_time = int(time.monotonic()) + created_time = int(time.time()) truncate_prompt_tokens = None @@ -218,9 +188,13 @@ async def create_embedding( final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch) - response = request_output_to_embedding_response( - final_res_batch_checked, request_id, created_time, model_name, - encoding_format) + response = self.request_output_to_embedding_response( + final_res_batch_checked, + request_id, + created_time, + model_name, + encoding_format, + ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: @@ -228,3 +202,40 @@ async def create_embedding( return 
self.create_error_response(str(e)) return response + + def request_output_to_embedding_response( + self, + final_res_batch: List[PoolingRequestOutput], + request_id: str, + created_time: int, + model_name: str, + encoding_format: Literal["float", "base64"], + ) -> EmbeddingResponse: + items: List[EmbeddingResponseData] = [] + num_prompt_tokens = 0 + + for idx, final_res in enumerate(final_res_batch): + embedding_res = EmbeddingRequestOutput.from_base(final_res) + + item = EmbeddingResponseData( + index=idx, + embedding=_get_embedding(embedding_res.outputs, + encoding_format), + ) + prompt_token_ids = final_res.prompt_token_ids + + items.append(item) + num_prompt_tokens += len(prompt_token_ids) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, + ) + + return EmbeddingResponse( + id=request_id, + created=created_time, + model=model_name, + data=items, + usage=usage, + ) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py new file mode 100644 index 0000000000000..01852f0df1eca --- /dev/null +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -0,0 +1,234 @@ +import asyncio +import base64 +import time +from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast + +import numpy as np +from fastapi import Request +from typing_extensions import assert_never + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ErrorResponse, + PoolingChatRequest, + PoolingRequest, PoolingResponse, + PoolingResponseData, UsageInfo) +from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.logger import init_logger +from vllm.outputs import PoolingOutput, PoolingRequestOutput +from vllm.utils import merge_async_iterators + +logger = init_logger(__name__) + + +def _get_data( + output: PoolingOutput, + encoding_format: Literal["float", "base64"], +) -> Union[List[float], str]: + if encoding_format == "float": + return output.data.tolist() + elif encoding_format == "base64": + # Force to use float32 for base64 encoding + # to match the OpenAI python client behavior + pooling_bytes = np.array(output.data, dtype="float32").tobytes() + return base64.b64encode(pooling_bytes).decode("utf-8") + + assert_never(encoding_format) + + +class OpenAIServingPooling(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + *, + request_logger: Optional[RequestLogger], + chat_template: Optional[str], + chat_template_content_format: ChatTemplateContentFormatOption, + ) -> None: + super().__init__(engine_client=engine_client, + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=None, + prompt_adapters=None, + request_logger=request_logger) + + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format + + async def create_pooling( + self, + request: PoolingRequest, + raw_request: Optional[Request] = None, + ) -> Union[PoolingResponse, ErrorResponse]: + """ + See https://platform.openai.com/docs/api-reference/embeddings/create + for the API specification. This API mimics the OpenAI Embedding API. 
+ """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + encoding_format = request.encoding_format + if request.dimensions is not None: + return self.create_error_response( + "dimensions is currently not supported") + + model_name = request.model + request_id = f"pool-{self._base_request_id(raw_request)}" + created_time = int(time.time()) + + truncate_prompt_tokens = None + + if request.truncate_prompt_tokens is not None: + if request.truncate_prompt_tokens <= self.max_model_len: + truncate_prompt_tokens = request.truncate_prompt_tokens + else: + return self.create_error_response( + "truncate_prompt_tokens value is " + "greater than max_model_len." + " Please, select a smaller truncation size.") + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for pooling models") + + if isinstance(request, PoolingChatRequest): + ( + _, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. + chat_template_content_format, + # In pooling requests, we are not generating tokens, + # so there is no need to append extra tokens to the input + add_generation_prompt=False, + continue_final_message=False, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + else: + (request_prompts, + engine_prompts) = await self._preprocess_completion( + request, + tokenizer, + request.input, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + # Schedule the request and get the result generator. 
+ generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + try: + pooling_params = request.to_pooling_params() + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs(request_id_item, + request_prompts[i], + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + result_generator = merge_async_iterators(*generators) + + num_prompts = len(engine_prompts) + + # Non-streaming response + final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch = [None] * num_prompts + try: + async for i, res in result_generator: + final_res_batch[i] = res + + assert all(final_res is not None for final_res in final_res_batch) + + final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch) + + response = self.request_output_to_pooling_response( + final_res_batch_checked, + request_id, + created_time, + model_name, + encoding_format, + ) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + return response + + def request_output_to_pooling_response( + self, + final_res_batch: List[PoolingRequestOutput], + request_id: str, + created_time: int, + model_name: str, + encoding_format: Literal["float", "base64"], + ) -> PoolingResponse: + items: List[PoolingResponseData] = [] + num_prompt_tokens = 0 + + for idx, final_res in enumerate(final_res_batch): + item = PoolingResponseData( + index=idx, + data=_get_data(final_res.outputs, encoding_format), + ) + prompt_token_ids = final_res.prompt_token_ids + + items.append(item) + num_prompt_tokens += len(prompt_token_ids) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, + ) + + return PoolingResponse( + id=request_id, + created=created_time, + model=model_name, + data=items, + usage=usage, + ) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 101d170bee4d6..a8a126e697641 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -20,32 +20,6 @@ logger = init_logger(__name__) -def request_output_to_score_response( - final_res_batch: List[PoolingRequestOutput], request_id: str, - created_time: int, model_name: str) -> ScoreResponse: - data: List[ScoreResponseData] = [] - num_prompt_tokens = 0 - for idx, final_res in enumerate(final_res_batch): - classify_res = ScoringRequestOutput.from_base(final_res) - - score_data = ScoreResponseData(index=idx, - score=classify_res.outputs.score) - data.append(score_data) - - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - total_tokens=num_prompt_tokens, - ) - - return ScoreResponse( - id=request_id, - created=created_time, - model=model_name, - data=data, - usage=usage, - ) - - def make_pairs(text_1: Union[List[str], str], text_2: Union[List[str], str]) -> List: if isinstance(text_1, (str, dict)): @@ -103,7 +77,7 @@ async def create_score( model_name 
= request.model request_id = f"score-{self._base_request_id(raw_request)}" - created_time = int(time.monotonic()) + created_time = int(time.time()) truncate_prompt_tokens = request.truncate_prompt_tokens request_prompts = [] @@ -203,8 +177,12 @@ async def create_score( final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch) - response = request_output_to_score_response( - final_res_batch_checked, request_id, created_time, model_name) + response = self.request_output_to_score_response( + final_res_batch_checked, + request_id, + created_time, + model_name, + ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: @@ -212,3 +190,38 @@ async def create_score( return self.create_error_response(str(e)) return response + + def request_output_to_score_response( + self, + final_res_batch: List[PoolingRequestOutput], + request_id: str, + created_time: int, + model_name: str, + ) -> ScoreResponse: + items: List[ScoreResponseData] = [] + num_prompt_tokens = 0 + + for idx, final_res in enumerate(final_res_batch): + classify_res = ScoringRequestOutput.from_base(final_res) + + item = ScoreResponseData( + index=idx, + score=classify_res.outputs.score, + ) + prompt_token_ids = final_res.prompt_token_ids + + items.append(item) + num_prompt_tokens += len(prompt_token_ids) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, + ) + + return ScoreResponse( + id=request_id, + created=created_time, + model=model_name, + data=items, + usage=usage, + ) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 2816b5c5c1f88..5495bc50ede83 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -22,7 +22,7 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index d4402e77a3886..aaeecab7ffde1 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -13,7 +13,7 @@ to dispatch data processing according to the target model. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index fb02627eb22bd..f3ec9d115c9ba 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -314,7 +314,7 @@ def dummy_data_for_profiling( The model is identified by ``model_config``. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` Note: This should be called after @@ -391,7 +391,7 @@ def register_input_processor(self, processor: InputProcessor): happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ def wrapper(model_cls: N) -> N: @@ -435,7 +435,7 @@ def process_input(self, model_config: "ModelConfig", The model is identified by ``model_config``. 
See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a6c93a3d8bfe9..85164c2165a3c 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -425,8 +425,9 @@ def forward(self, input_): if self.base_layer.skip_bias_add else None) return output, output_bias + # ReplicatedLinear should always be replaced, regardless of the fully + # sharded LoRAs setting, because it is, by definition, copied per GPU. @classmethod - @_not_fully_sharded_can_replace def can_replace_layer( cls, source_layer: nn.Module, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 70806a77b9fff..5c0e4e5cbc636 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -28,7 +28,7 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.models.utils import PPMissingLayer +from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -113,13 +113,14 @@ def from_lora_tensors( target_embedding_padding: Optional[int] = None, embedding_modules: Optional[Dict[str, str]] = None, embedding_padding_modules: Optional[List[str]] = None, + weights_mapper: Optional[WeightsMapper] = None, ) -> "LoRAModel": """Create a LoRAModel from a dictionary of tensors.""" pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: Dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name( - tensor_name) + tensor_name, weights_mapper) if module_name not in loras: lora_embeddings_tensor = None if embeddings: @@ -187,6 +188,7 @@ def from_local_checkpoint( target_embedding_padding: Optional[int] = None, embedding_modules: Optional[Dict[str, str]] = None, embedding_padding_modules: Optional[List[str]] = None, + weights_mapper: Optional[WeightsMapper] = None, ) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. 
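
The hunks above and below thread a weights_mapper through LoRA checkpoint loading so that HF-style tensor names can be remapped onto vLLM module names (e.g. for multimodal models such as Qwen2-VL) before they are parsed. The following is a minimal, hedged sketch of the intended effect: parse_fine_tuned_lora_name and WeightsMapper are the names used in this PR (see vllm/lora/utils.py and vllm/model_executor/models/utils.py), while the mapper instance below is invented purely for illustration and does not correspond to any real model's hf_to_vllm_mapper.

    from vllm.lora.utils import parse_fine_tuned_lora_name
    from vllm.model_executor.models.utils import WeightsMapper

    # Hypothetical mapper: route bare "model." prefixes to the wrapped
    # language model, in the same spirit as the hf_to_vllm_mapper
    # attributes added elsewhere in this PR.
    mapper = WeightsMapper(
        orig_to_new_prefix={"model.": "language_model.model."})

    module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name(
        "base_model.model.model.layers.0.self_attn.qkv_proj.lora_A.weight",
        weights_mapper=mapper,
    )
    # With the utils.py change below, the "base_model.model." prefix is
    # stripped, the mapper is applied, and the prefix is restored, so this
    # should yield:
    #   module_name == "language_model.model.layers.0.self_attn.qkv_proj"
    #   is_lora_a is True, is_bias is False
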
@@ -229,7 +231,8 @@ def from_local_checkpoint( with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore for lora_module in f.keys(): # noqa - module_name, _, _ = parse_fine_tuned_lora_name(lora_module) + module_name, _, _ = parse_fine_tuned_lora_name( + lora_module, weights_mapper) part_name = module_name.split(".")[-1] if part_name not in expected_lora_modules: unexpected_modules.append(module_name) @@ -289,7 +292,8 @@ def from_local_checkpoint( embeddings=embeddings, target_embedding_padding=target_embedding_padding, embedding_modules=embedding_modules, - embedding_padding_modules=embedding_padding_modules) + embedding_padding_modules=embedding_padding_modules, + weights_mapper=weights_mapper) class LoRAModelManager(AdapterModelManager): diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 5876494ce2824..d72b7638d84af 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -30,6 +30,7 @@ # yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.utils import WeightsMapper logger = init_logger(__name__) @@ -91,28 +92,46 @@ def replace_submodule(model: nn.Module, module_name: str, return new_module -def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool, bool]: +def parse_fine_tuned_lora_name( + name: str, + weights_mapper: Optional[WeightsMapper] = None +) -> Tuple[str, bool, bool]: """Parse the name of lora weights. args: name: the name of the fine-tuned LoRA, e.g. base_model.model.dense1.weight + weights_mapper: maps the name of weight, e.g. + `model.` -> `language_model.model.`, return: Tuple(module_name, is_lora_a): module_name: the name of the module, e.g. model.dense1, is_lora_a whether the tensor is lora_a or lora_b. is_bias whether the tensor is lora bias. """ + + # LoRA weight qualified name always starts with `base_model.model.`, + # so we remove the prefix `base_model.model.` to make the following + # mapping correctly. + if "base_model.model." in name: + name = name.replace("base_model.model.", "") + name = weights_mapper._map_name(name) if weights_mapper else name + # recover the prefix `base_model.model.` + name = "base_model.model." + name + parts = name.split(".") if parts[-1] == "weight" and (parts[-2] == "lora_A" or parts[-2] == "lora_B"): - return ".".join(parts[2:-2]), parts[-2] == "lora_A", False + new_name = ".".join(parts[2:-2]) + return new_name, parts[-2] == "lora_A", False if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": - return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False + new_name = ".".join(parts[2:-1]) + return new_name, parts[-1] == "lora_embedding_A", False if parts[-1] == "bias": - return ".".join(parts[2:-2]), False, True + new_name = ".".join(parts[2:-2]) + return new_name, False, True raise ValueError(f"{name} is unsupported LoRA weight") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 93a5e27621912..10976fac23028 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -91,7 +91,17 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: packed_modules_mapping[module]) else: expected_lora_modules.append(module) + + expected_lora_modules = list(set(expected_lora_modules)) lora_path = get_adapter_absolute_path(lora_request.lora_path) + + # For some models like Qwen2VL, we need to use hf_to_vllm_mapper + # to ensure correct loading of lora weights. 
+ hf_to_vllm_mapper = None + if (hasattr(model, "hf_to_vllm_mapper") + and model.hf_to_vllm_mapper is not None): + hf_to_vllm_mapper = model.hf_to_vllm_mapper + lora = self._lora_model_cls.from_local_checkpoint( lora_path, expected_lora_modules, @@ -103,7 +113,8 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: self.lora_config.lora_extra_vocab_size, embedding_modules=self.embedding_modules, embedding_padding_modules=self.embedding_padding_modules, - ) + weights_mapper=hf_to_vllm_mapper) + except Exception as e: raise RuntimeError(f"Loading lora {lora_path} failed") from e if lora.rank > self.lora_config.max_lora_rank: diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 550b892303feb..694c5b68b1cbd 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -3,6 +3,9 @@ from typing import TYPE_CHECKING from vllm.logger import init_logger +from vllm.model_executor.guided_decoding.utils import ( + convert_lark_to_gbnf, grammar_is_likely_lark, + has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features) from vllm.platforms import CpuArchEnum, current_platform if TYPE_CHECKING: @@ -15,76 +18,6 @@ logger = init_logger(__name__) -def has_xgrammar_unsupported_json_features(schema: dict) -> bool: - """Check if JSON schema contains features unsupported by xgrammar.""" - - def check_object(obj: dict) -> bool: - if not isinstance(obj, dict): - return False - - # Check for pattern restrictions - if "pattern" in obj: - return True - - # Check for numeric ranges - if obj.get("type") in ("integer", "number") and any( - key in obj for key in [ - "minimum", "maximum", "exclusiveMinimum", - "exclusiveMaximum", "multipleOf" - ]): - return True - - # Recursively check all nested objects and arrays - for value in obj.values(): - if isinstance(value, dict): - if check_object(value): - return True - elif isinstance(value, list): - for item in value: - if isinstance(item, dict) and check_object(item): - return True - - return False - - return check_object(schema) - - -def has_lmf_unsupported_json_features(schema: dict) -> bool: - """ - Check if JSON schema contains features unsupported - by lm_format_enforcer. - - Known issues: - - Regex patterns: - "grade": { - "type": "string", - "pattern": "^[A-D]$" # Regex pattern - }, - """ - - def check_object(obj: dict) -> bool: - if not isinstance(obj, dict): - return False - - # Check for pattern restrictions - if "pattern" in obj: - return True - - # Recursively check all nested objects and arrays - for value in obj.values(): - if isinstance(value, dict): - if check_object(value): - return True - elif isinstance(value, list): - for item in value: - if isinstance(item, dict) and check_object(item): - return True - - return False - - return check_object(schema) - - def maybe_backend_fallback( guided_params: GuidedDecodingParams) -> GuidedDecodingParams: # lm-format-enforce doesn't support grammar, fallback to xgrammar @@ -127,6 +60,20 @@ def maybe_backend_fallback( "Falling back to use outlines instead.") guided_params.backend = "outlines" + # xgrammar only supports GBNF grammars, so we must convert Lark. 
+ # We must check if the grammar is likely Lark and if that + # grammar is convertible to GBNF + elif (guided_params.grammar is not None + and grammar_is_likely_lark(guided_params.grammar)): + try: + convert_lark_to_gbnf(guided_params.grammar) + except Exception: + logger.warning( + "xgrammar does not support Lark grammars and the " + "grammar failed to convert to GBNF. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + if (guided_params.backend == "outlines" and guided_params.json_object is not None): # outlines doesn't support json_object, fallback to xgrammar diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index b63fed1c8a8c3..e4eb3f16e56cf 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -21,10 +21,11 @@ import numpy as np import torch -from lark import Lark from outlines import grammars from outlines.caching import cache -from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write +from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide, + RegexGuide, Write) +from outlines.fsm.parsing import PartialLark from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from transformers import PreTrainedTokenizerBase @@ -34,7 +35,9 @@ class BaseLogitsProcessor: def __init__(self, guide: Guide): self._guide: Guide = guide - self._fsm_state: DefaultDict[int, int] = defaultdict(int) + # CFGState is used for the FSM state for CFGGuide + self._fsm_state: DefaultDict[int, Union[int, + CFGState]] = defaultdict(int) def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor: @@ -54,15 +57,13 @@ def __call__(self, input_ids: List[int], # On the first time this is called, we simply re-create # the Lark object. 
if isinstance(self._guide, CFGGuide): - self._guide.parser = Lark( + self._guide.parser = PartialLark( self._guide.cfg_string, parser="lalr", - lexer="contextual", - propagate_positions=False, - maybe_placeholders=False, - regex=True, import_paths=[grammars.GRAMMAR_PATH], ) + self._fsm_state[seq_id] = CFGState( + parser_state=self._guide.parser.parse(""), prev_token=None) instruction = self._guide.get_next_instruction( state=self._fsm_state[seq_id]) @@ -200,7 +201,8 @@ def convert_token_to_string(token: str) -> str: string = tokenizer.convert_tokens_to_string([token]) # A hack to handle missing spaces to HF's Llama tokenizers - if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>": + if (type(token) is str and token.startswith(SPIECE_UNDERLINE) + or token == "<0x20>"): return " " + string return string @@ -211,6 +213,9 @@ def change_decoder( """Sync vLLM's decoder with the outlines by returning list.""" def new_decoder(inp_tokens: List[int]) -> List[str]: + if (isinstance(inp_tokens, list) and len(inp_tokens) == 1 + and isinstance(inp_tokens[0], list)): + inp_tokens = inp_tokens[0] return [decoder(inp_tokens)] return new_decoder diff --git a/vllm/model_executor/guided_decoding/xgrammar_utils.py b/vllm/model_executor/guided_decoding/utils.py similarity index 72% rename from vllm/model_executor/guided_decoding/xgrammar_utils.py rename to vllm/model_executor/guided_decoding/utils.py index 9a0463964de49..20abaefbacc51 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -1,6 +1,76 @@ import re +def has_xgrammar_unsupported_json_features(schema: dict) -> bool: + """Check if JSON schema contains features unsupported by xgrammar.""" + + def check_object(obj: dict) -> bool: + if not isinstance(obj, dict): + return False + + # Check for pattern restrictions + if "pattern" in obj: + return True + + # Check for numeric ranges + if obj.get("type") in ("integer", "number") and any( + key in obj for key in [ + "minimum", "maximum", "exclusiveMinimum", + "exclusiveMaximum", "multipleOf" + ]): + return True + + # Recursively check all nested objects and arrays + for value in obj.values(): + if isinstance(value, dict): + if check_object(value): + return True + elif isinstance(value, list): + for item in value: + if isinstance(item, dict) and check_object(item): + return True + + return False + + return check_object(schema) + + +def has_lmf_unsupported_json_features(schema: dict) -> bool: + """ + Check if JSON schema contains features unsupported + by lm_format_enforcer. + + Known issues: + - Regex patterns: + "grade": { + "type": "string", + "pattern": "^[A-D]$" # Regex pattern + }, + """ + + def check_object(obj: dict) -> bool: + if not isinstance(obj, dict): + return False + + # Check for pattern restrictions + if "pattern" in obj: + return True + + # Recursively check all nested objects and arrays + for value in obj.values(): + if isinstance(value, dict): + if check_object(value): + return True + elif isinstance(value, list): + for item in value: + if isinstance(item, dict) and check_object(item): + return True + + return False + + return check_object(schema) + + def grammar_is_likely_lark(grammar_str: str) -> bool: """ Check if grammar appears to use Lark syntax. 
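
Since the schema- and grammar-inspection helpers are relocated into vllm/model_executor/guided_decoding/utils.py, the following minimal sketch shows how they feed the backend-fallback logic in maybe_backend_fallback() above. The module path and function names are taken from this diff; the schema and grammar literals are purely illustrative.

    from vllm.model_executor.guided_decoding.utils import (
        convert_lark_to_gbnf, grammar_is_likely_lark,
        has_xgrammar_unsupported_json_features)

    # Numeric-range keywords (minimum/maximum/exclusiveMinimum/...) are among
    # the JSON-schema features xgrammar cannot enforce, so a schema like this
    # makes the fallback path switch to the outlines backend.
    schema = {
        "type": "object",
        "properties": {
            "age": {"type": "integer", "minimum": 0, "maximum": 120},
        },
    }
    assert has_xgrammar_unsupported_json_features(schema)

    # Lark-looking grammars stay on xgrammar only if they convert to GBNF;
    # otherwise maybe_backend_fallback() likewise falls back to outlines.
    lark_grammar = 'start: "yes" | "no"'
    if grammar_is_likely_lark(lark_grammar):
        try:
            convert_lark_to_gbnf(lark_grammar)
        except Exception:
            print("grammar not convertible to GBNF -> fall back to outlines")
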
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 5b97f03257502..5e1948977bff4 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -14,8 +14,8 @@ except ImportError: pass -from vllm.model_executor.guided_decoding.xgrammar_utils import ( - convert_lark_to_gbnf, grammar_is_likely_lark) +from vllm.model_executor.guided_decoding.utils import (convert_lark_to_gbnf, + grammar_is_likely_lark) from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer if TYPE_CHECKING: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 9ad61a64e406c..61d1c911cd1ad 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -61,6 +61,10 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): + assert params_dtype == torch.float16, ( + "float16 is required for marlin24 compressd models. Set dtype=torch.float16" # noqa: E501 + ) + pack_factor = 32 // self.quant_type.size_bits output_size_per_partition = sum(output_partition_sizes) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index a74eaef5efdee..dfae4db71e546 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -30,7 +30,7 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. - if proj_name in FUSED_LAYER_NAME_MAPPING: + if proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore: shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] # Convert fused_name --> [shard_names] diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index fdc4c6305bd5e..24e554e6060ab 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -45,9 +45,10 @@ filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, get_gguf_extra_tensor_names, gguf_quant_weights_iterator, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, - safetensors_weights_iterator) + runai_safetensors_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.transformers_utils.utils import is_s3 from vllm.utils import is_pin_memory_available @@ -1234,6 +1235,118 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: return model +class RunaiModelStreamerLoader(BaseModelLoader): + """ + Model loader that can load safetensors + files from local FS or S3 bucket. 
+ """ + + def __init__(self, load_config: LoadConfig): + super().__init__(load_config) + if load_config.model_loader_extra_config: + extra_config = load_config.model_loader_extra_config + + if ("concurrency" in extra_config + and isinstance(extra_config.get("concurrency"), int)): + os.environ["RUNAI_STREAMER_CONCURRENCY"] = str( + extra_config.get("concurrency")) + + if ("memory_limit" in extra_config + and isinstance(extra_config.get("memory_limit"), int)): + os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str( + extra_config.get("memory_limit")) + + runai_streamer_s3_endpoint = os.getenv( + 'RUNAI_STREAMER_S3_ENDPOINT') + aws_endpoint_url = os.getenv('AWS_ENDPOINT_URL') + if (runai_streamer_s3_endpoint is None + and aws_endpoint_url is not None): + os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url + + def _prepare_weights(self, model_name_or_path: str, + revision: Optional[str]) -> List[str]: + """Prepare weights for the model. + + If the model is not local, it will be downloaded.""" + is_s3_path = is_s3(model_name_or_path) + if is_s3_path: + try: + from vllm.transformers_utils.s3_utils import glob as s3_glob + except ImportError as err: + raise ImportError( + "Please install Run:ai optional dependency " + "to use the S3 capabilities. " + "You can install it with: pip install vllm[runai]" + ) from err + + is_local = os.path.isdir(model_name_or_path) + safetensors_pattern = "*.safetensors" + index_file = SAFE_WEIGHTS_INDEX_NAME + + hf_folder = (model_name_or_path if + (is_local or is_s3_path) else download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + [safetensors_pattern], + revision, + ignore_patterns=self.load_config.ignore_patterns, + )) + + if is_s3_path: + hf_weights_files = s3_glob(path=hf_folder, + allow_pattern=[safetensors_pattern]) + else: + hf_weights_files = glob.glob( + os.path.join(hf_folder, safetensors_pattern)) + + if not is_local and not is_s3_path: + download_safetensors_index_file_from_hf( + model_name_or_path, index_file, self.load_config.download_dir, + revision) + + if not hf_weights_files: + raise RuntimeError( + f"Cannot find any safetensors model weights with " + f"`{model_name_or_path}`") + + return hf_weights_files + + def _get_weights_iterator( + self, model_or_path: str, + revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]: + """Get an iterator for the model weights based on the load format.""" + hf_weights_files = self._prepare_weights(model_or_path, revision) + return runai_safetensors_weights_iterator(hf_weights_files) + + def download_model(self, model_config: ModelConfig) -> None: + """Download model if necessary""" + self._prepare_weights(model_config.model, model_config.revision) + + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + """Perform streaming of the model to destination""" + device_config = vllm_config.device_config + model_config = vllm_config.model_config + + target_device = torch.device(device_config.device) + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = _initialize_model(vllm_config=vllm_config) + + model_weights = model_config.model + if hasattr(model_config, "model_weights"): + model_weights = model_config.model_weights + model.load_weights( + self._get_weights_iterator(model_weights, + model_config.revision)) + + for _, module in model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if quant_method is not None: + with device_loading_context(module, target_device): + 
quant_method.process_weights_after_loading(module) + return model.eval() + + def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: """Get a model loader based on the load format.""" @@ -1255,4 +1368,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: if load_config.load_format == LoadFormat.GGUF: return GGUFModelLoader(load_config) + if load_config.load_format == LoadFormat.RUNAI_STREAMER: + return RunaiModelStreamerLoader(load_config) + return DefaultModelLoader(load_config) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index f15e7176b3d50..44978a55e072d 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -7,7 +7,9 @@ from vllm.config import ModelConfig from vllm.model_executor.models import ModelRegistry -from vllm.model_executor.models.adapters import as_embedding_model +from vllm.model_executor.models.adapters import (as_classification_model, + as_embedding_model, + as_reward_model) @contextlib.contextmanager @@ -35,8 +37,12 @@ def get_model_architecture( architectures = ["QuantMixtralForCausalLM"] model_cls, arch = ModelRegistry.resolve_model_cls(architectures) - if model_config.runner_type == "pooling": + if model_config.task == "embed": model_cls = as_embedding_model(model_cls) + elif model_config.task == "classify": + model_cls = as_classification_model(model_cls) + elif model_config.task == "reward": + model_cls = as_reward_model(model_cls) return model_cls, arch diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9488d54edf365..f2a9e7e2687cb 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -410,6 +410,30 @@ def safetensors_weights_iterator( yield name, param +def runai_safetensors_weights_iterator( + hf_weights_files: List[str] +) -> Generator[Tuple[str, torch.Tensor], None, None]: + """Iterate over the weights in the model safetensor files.""" + try: + from runai_model_streamer import SafetensorsStreamer + except ImportError as err: + raise ImportError( + "Please install Run:ai optional dependency." 
+ "You can install it with: pip install vllm[runai]") from err + + enable_tqdm = not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0 + with SafetensorsStreamer() as streamer: + for st_file in tqdm( + hf_weights_files, + desc="Loading safetensors using Runai Model Streamer", + disable=not enable_tqdm, + bar_format=_BAR_FORMAT, + ): + streamer.stream_file(st_file) + yield from streamer.get_tensors() + + def pt_weights_iterator( hf_weights_files: List[str] ) -> Generator[Tuple[str, torch.Tensor], None, None]: diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 9cc43ae9181b9..55e90b9d41950 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -1,29 +1,48 @@ from collections.abc import Iterable -from typing import Any, TypeVar +from typing import TYPE_CHECKING, Any, Optional, TypeVar import torch import torch.nn as nn from .interfaces_base import VllmModelForPooling, is_pooling_model +if TYPE_CHECKING: + from vllm.model_executor.layers.pooler import PoolingType + _T = TypeVar("_T", bound=type[nn.Module]) +_GENERATE_SUFFIXES = [ + "ForCausalLM", + "ForConditionalGeneration", + "ChatModel", + "LMHeadModel", +] -def as_embedding_model(cls: _T) -> _T: - """Subclass an existing vLLM model to support embeddings.""" - # Avoid modifying existing embedding models - if is_pooling_model(cls): - return cls +def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str: + model_name = orig_model_name + + for generate_suffix in _GENERATE_SUFFIXES: + model_name = model_name.removesuffix(generate_suffix) + + return model_name + pooling_suffix + + +def _create_pooling_model_cls( + orig_cls: _T, + *, + default_pooling_type: "PoolingType", + default_normalize: bool, + default_softmax: bool, +) -> _T: # Lazy import from vllm.config import VllmConfig - from vllm.model_executor.layers.pooler import (Pooler, PoolerOutput, - PoolingType) + from vllm.model_executor.layers.pooler import Pooler, PoolerOutput from vllm.model_executor.pooling_metadata import PoolingMetadata from .utils import AutoWeightsLoader, WeightsMapper - class ModelForEmbedding(cls, VllmModelForPooling): + class ModelForPooling(orig_cls, VllmModelForPooling): def __init__( self, @@ -34,7 +53,7 @@ def __init__( ) -> None: super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) - # These are not used in embedding models + # These are not used in pooling models for attr in ("lm_head", "logits_processor"): if hasattr(self, attr): delattr(self, attr) @@ -46,9 +65,9 @@ def __init__( if not getattr(self, "_pooler", None): self._pooler = Pooler.from_config_with_defaults( pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False, + pooling_type=default_pooling_type, + normalize=default_normalize, + softmax=default_softmax, ) def pooler( @@ -82,17 +101,148 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): return # For most other models - if hasattr(cls, "load_weights"): - cls.load_weights(self, weights) # type: ignore + if hasattr(orig_cls, "load_weights"): + orig_cls.load_weights(self, weights) # type: ignore # Fallback else: loader = AutoWeightsLoader(self) loader.load_weights(weights) - ModelForEmbedding.__name__ = cls.__name__ \ - .removesuffix("ForCausalLM") \ - .removesuffix("ForConditionalGeneration") \ - .removesuffix("ChatModel") \ - .removesuffix("LMHeadModel") + "ForEmbedding" + return ModelForPooling # type: ignore + + +def as_embedding_model(cls: _T) -> _T: 
+ """ + Subclass an existing vLLM model to support embeddings. + + By default, the embeddings of the whole prompt are extracted from the + normalized hidden state corresponding to the last token. + + Note: + We assume that no extra layers are added to the original model; + please implement your own model if this is not the case. + """ + # Avoid modifying existing embedding models + if is_pooling_model(cls): + return cls + + # Lazy import + from vllm.model_executor.layers.pooler import PoolingType + + ModelForEmbedding = _create_pooling_model_cls( + cls, + default_pooling_type=PoolingType.LAST, + default_normalize=True, + default_softmax=False, + ) + ModelForEmbedding.__name__ = \ + _get_pooling_model_name(cls.__name__, "ForEmbedding") return ModelForEmbedding # type: ignore + + +def as_classification_model(cls: _T) -> _T: + """ + Subclass an existing vLLM model to support classification. + + By default, the class probabilities are extracted from the softmaxed + hidden state corresponding to the last token. + + Note: + We assume that the classification head is a single linear layer + stored as the attribute `score` of the top-level model; + please implement your own model if this is not the case. + """ + # Avoid modifying existing classification models + if is_pooling_model(cls): + return cls + + # Lazy import + from vllm.attention import AttentionMetadata + from vllm.config import VllmConfig + from vllm.model_executor.layers.linear import RowParallelLinear + from vllm.model_executor.layers.pooler import PoolingType + from vllm.sequence import IntermediateTensors + + from .utils import maybe_prefix + + ModelForPooling = _create_pooling_model_cls( + cls, + default_pooling_type=PoolingType.LAST, + default_normalize=False, + default_softmax=True, + ) + + class ModelForClassification(ModelForPooling): + + def __init__( + self, + *, + vllm_config: "VllmConfig", + prefix: str = "", + **kwargs: Any, + ) -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.score = RowParallelLinear(config.hidden_size, + config.num_labels, + quant_config=quant_config, + input_is_parallel=False, + bias=False, + prefix=maybe_prefix( + prefix, "score")) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: list[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = super().forward(input_ids, positions, kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds) + logits, _ = self.score(hidden_states) + return logits + + + ModelForClassification.__name__ = \ + _get_pooling_model_name(cls.__name__, "ForClassification") + + return ModelForClassification # type: ignore + + +def as_reward_model(cls: _T) -> _T: + """ + Subclass an existing vLLM model to support reward modeling. + + By default, we return the hidden states of each token directly. + + Note: + We assume that no extra layers are added to the original model; + please implement your own model if this is not the case. 
+ """ + # Avoid modifying existing reward models + if is_pooling_model(cls): + return cls + + # Lazy import + from vllm.model_executor.layers.pooler import PoolingType + + ModelForReward = _create_pooling_model_cls( + cls, + default_pooling_type=PoolingType.ALL, + default_normalize=False, + default_softmax=False, + ) + + ModelForReward.__name__ = \ + _get_pooling_model_name(cls.__name__, "ForReward") + + return ModelForReward # type: ignore diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index dd4b0c75cb84d..9437ad9688422 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -521,6 +521,15 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): This model combines a vision tower, a multi-modal projector, and a language model to perform tasks that involve both image and text inputs. """ + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "language_model.model": "language_model", + "language_model.lm_head": "lm_head", + }, + orig_to_new_suffix={ + "router.weight": "router_weight", + }, + ) def __init__( self, @@ -662,15 +671,6 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "language_model.model": "language_model", - "language_model.lm_head": "lm_head", - }, - orig_to_new_suffix={ - "router.weight": "router_weight", - }, - ) loader = AutoWeightsLoader(self) - loader.load_weights(weights, mapper=hf_to_vllm_mapper) + loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 053d838432885..c1d47b1bc9bcd 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -409,6 +409,7 @@ class BertEmbeddingModel(nn.Module): model: An instance of BertModel used for forward operations. _pooler: An instance of Pooler used for pooling operations. 
""" + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -441,8 +442,7 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - weights = hf_to_vllm_mapper.apply(weights) + weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) self.model.load_weights(weights) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4664aa53ea092..f4530e4771960 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -31,11 +31,14 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -326,6 +329,15 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: + if scale_name := get_compressed_tensors_cache_scale(name): + # Loading kv cache scales for compressed-tensors quantization + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = loaded_weight[0] + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: continue @@ -343,6 +355,10 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue if is_pp_missing_parameter(name, self): continue param = params_dict[name] diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 9f744b6918818..63a25137f8aa9 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1123,6 +1123,34 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo) class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + # vision backbone mapping + "image_projector.w1.": "image_projector.gate_proj.", + "image_projector.w3.": "image_projector.up_proj.", + "image_projector.w2.": "image_projector.down_proj.", + # language backbone mapping + "att_proj": "self_attn.qkv_proj", + "attn_out": "self_attn.o_proj", + "q_norm": "self_attn.q_norm", + "k_norm": "self_attn.k_norm", + "ff_proj": "mlp.gate_up_proj", + "ff_out": "mlp.down_proj", + "attn_norm": "input_layernorm", + "ff_norm": "post_attention_layernorm", + }, + orig_to_new_prefix={ + # vision backbone mapping + "model.vision_backbone.": "vision_backbone.", + # language backbone mapping + "model.transformer.blocks.": "model.layers.", + "model.transformer.ln_f.": "model.norm.", + # lm_head is renamed to model.transformer.mlp.down_proj firstly, + # we need to run a second renaming for it + "model.transformer.mlp.down_proj.": "lm_head.", + }, + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -1298,36 +1326,10 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_substr={ - # vision backbone mapping - "image_projector.w1.": "image_projector.gate_proj.", - "image_projector.w3.": "image_projector.up_proj.", - "image_projector.w2.": "image_projector.down_proj.", - # language backbone mapping - "att_proj": "self_attn.qkv_proj", - "attn_out": "self_attn.o_proj", - "q_norm": "self_attn.q_norm", - "k_norm": "self_attn.k_norm", - "ff_proj": "mlp.gate_up_proj", - "ff_out": "mlp.down_proj", - "attn_norm": "input_layernorm", - "ff_norm": "post_attention_layernorm", - }, - orig_to_new_prefix={ - # vision backbone mapping - "model.vision_backbone.": "vision_backbone.", - # language backbone mapping - "model.transformer.blocks.": "model.layers.", - "model.transformer.ln_f.": "model.norm.", - # lm_head is renamed to model.transformer.mlp.down_proj firstly, - # we need to run a second renaming for it - "model.transformer.mlp.down_proj.": "lm_head.", - }, - ) + loader = AutoWeightsLoader(self) weights = _get_weights_with_merged_embedding(weights) - return loader.load_weights(weights, mapper=hf_to_vllm_mapper) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def _get_weights_with_merged_embedding( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index e2263f63f7bba..4e2e7f5761544 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -408,6 +408,13 @@ def _get_dummy_mm_inputs( @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + 
"model.vision_embed_tokens.wte": "embed_tokens", + "model.vision_embed_tokens.": "vision_embed_tokens.", + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -616,17 +623,10 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "model.vision_embed_tokens.wte": "embed_tokens", - "model.vision_embed_tokens.": "vision_embed_tokens.", - "lm_head.": "language_model.lm_head.", - "model.": "language_model.model.", - }) loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, - mapper=hf_to_vllm_mapper) + mapper=self.hf_to_vllm_mapper) # The HF config doesn't specify whether these are tied, # so we detect it this way diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6676dd16e005f..f3d66c2313198 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -45,8 +45,12 @@ except ImportError: USE_XFORMERS_OPS = False -PIXTRAL_IMAGE_BREAK_ID = 12 -PIXTRAL_IMAGE_END_ID = 13 +# These token ids cannot be retrieved from model config +# so we hardcode them here. +PIXTRAL_12B_IMAGE_BREAK_ID = 12 +PIXTRAL_12B_IMAGE_END_ID = 13 +PIXTRAL_LARGE_IMAGE_BREAK_ID = 14 +PIXTRAL_LARGE_IMAGE_END_ID = 15 def get_max_pixtral_image_tokens(ctx: InputContext): @@ -118,8 +122,7 @@ def input_mapper_for_pixtral(ctx: InputContext, for image_data in data_list: image = ImageChunk(image=image_data) encoding = tokenizer.instruct.mm_encoder(image) - image = torch.from_numpy(encoding.image).to(device="cuda", - dtype=torch.float16) + image = torch.from_numpy(encoding.image).to(dtype=torch.float16) images.append(image) image_tokens_list.append(encoding.tokens) @@ -237,8 +240,9 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: # NOTE: Image embeddings are split into separate tensors for each image # by the indices of `[IMG_END]` token. 
- split_indices = torch.where( - image_tokens == PIXTRAL_IMAGE_END_ID)[0] + 1 + image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | ( + image_tokens == PIXTRAL_LARGE_IMAGE_END_ID) + split_indices = torch.where(image_end_condition)[0] + 1 if len(split_indices) <= 1: # Do not split, return as tensor of shape [1, fs, hs] return image_embeds.unsqueeze(0) @@ -260,8 +264,11 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [ - self.vision_args.image_token_id, PIXTRAL_IMAGE_END_ID, - PIXTRAL_IMAGE_BREAK_ID + self.vision_args.image_token_id, + PIXTRAL_12B_IMAGE_END_ID, + PIXTRAL_12B_IMAGE_BREAK_ID, + PIXTRAL_LARGE_IMAGE_BREAK_ID, + PIXTRAL_LARGE_IMAGE_END_ID, ]) return inputs_embeds diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 3ce4eb5869f21..88f4ea4352726 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -529,6 +529,8 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -543,8 +545,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - # TODO: Replace this model class with for_embedding(Qwen2ForCausalLM), - # after changing the default pooling method + # TODO: Replace this model class with as_embedding_model( + # Qwen2ForCausalLM) after changing the default pooling method if pooler_config.pooling_type is None: logger.warning( "This embedding model will default to last-token pooling in " @@ -577,8 +579,7 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - weights = hf_to_vllm_mapper.apply(weights) + weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) self.model.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py deleted file mode 100644 index dc5dabf6fc38b..0000000000000 --- a/vllm/model_executor/models/qwen2_cls.py +++ /dev/null @@ -1,104 +0,0 @@ -# Adapted from -# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py -# Copyright 2024 Kakao Corp. (Kanana-X Team) -# Copyright 2024 The Qwen team. -# Copyright 2023 The vLLM team. 
-"""Inference-only Qwen2-Classification model compatible with HF weights.""" -from typing import Iterable, List, Optional, Set, Tuple - -import torch -from torch import nn - -from vllm.attention import AttentionMetadata -from vllm.config import VllmConfig -from vllm.model_executor.layers.linear import RowParallelLinear -from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.models.qwen2 import Qwen2Model -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.sequence import IntermediateTensors, PoolerOutput - -from .interfaces import SupportsLoRA, SupportsPP -from .utils import AutoWeightsLoader, maybe_prefix - - -class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", - ] - embedding_modules = {} - embedding_padding_modules = [] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config - - self.config = config - self.lora_config = lora_config - - self.quant_config = quant_config - self.model = Qwen2Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - # hidden_states from Qwen2Model has been reduced, - # the input of score layer is not parallelized. - self.score = RowParallelLinear(config.hidden_size, - config.num_labels, - quant_config=quant_config, - input_is_parallel=False, - bias=False, - prefix=maybe_prefix(prefix, "score")) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=True) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) - logits, _ = self.score(hidden_states) - return logits - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - loader = AutoWeightsLoader(self, - ignore_unexpected_prefixes=["lm_head."]) - return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b38ea923f0bf1..fb97eb1916002 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -901,6 +901,11 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ] embedding_modules = {} embedding_padding_modules = [] + # To ensure correct weight loading and mapping. 
+ hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -1190,11 +1195,6 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "lm_head.": "language_model.lm_head.", - "model.": "language_model.model.", - }) loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=hf_to_vllm_mapper) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 04d806c3c7eae..b32a3421d5841 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -20,11 +20,10 @@ from vllm.logger import init_logger from vllm.platforms import current_platform -from .adapters import as_embedding_model from .interfaces import (has_inner_state, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_pp) -from .interfaces_base import is_pooling_model, is_text_generation_model +from .interfaces_base import is_text_generation_model logger = init_logger(__name__) @@ -125,12 +124,13 @@ "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), - "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 + # [Auto-converted (see adapters.py)] + "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"), } _CROSS_ENCODER_MODELS = { @@ -226,19 +226,10 @@ class _ModelInfo: @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": - is_pooling_model_ = is_pooling_model(model) - if not is_pooling_model_: - try: - as_embedding_model(model) - except Exception: - pass - else: - is_pooling_model_ = True - return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), - is_pooling_model=is_pooling_model_, + is_pooling_model=True, # Can convert any model into a pooling model supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_pp=supports_pp(model), diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 39c9103527f01..28c37bb96612c 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -31,6 +31,19 @@ class TeleChat2Model(LlamaModel): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "transformer.": "model.", + }, + orig_to_new_substr={ + ".h.": ".layers.", + ".self_attention.": ".self_attn.", + ".word_embeddings.": ".embed_tokens.", + ".dense.": ".o_proj.", + ".ln_f.": ".norm.", + }, + ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # 1. 
Initialize the LlamaModel with bias vllm_config.model_config.hf_config.bias = True @@ -111,21 +124,9 @@ def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "transformer.": "model.", - }, - orig_to_new_substr={ - ".h.": ".layers.", - ".self_attention.": ".self_attn.", - ".word_embeddings.": ".embed_tokens.", - ".dense.": ".o_proj.", - ".ln_f.": ".norm.", - }, - ) loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights, mapper=hf_to_vllm_mapper) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index c60b208c3d27d..509ad9e580ddf 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -302,6 +302,9 @@ def forward( @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -494,9 +497,7 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}) loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) - return loader.load_weights(weights, mapper=hf_to_vllm_mapper) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 928c31a2f2843..9255e062e4870 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -11,7 +11,7 @@ dispatch data processing according to its modality and the target model. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index c92deddbcb255..314d21b746236 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,3 +1,5 @@ +from typing import Any + import numpy as np import numpy.typing as npt @@ -26,6 +28,16 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: "There is no default maximum multimodal tokens") +def try_import_audio_packages() -> tuple[Any, Any]: + try: + import librosa + import soundfile + except ImportError as exc: + raise ImportError( + "Please install vllm[audio] for audio support.") from exc + return librosa, soundfile + + def resample_audio( audio: npt.NDArray[np.floating], *, diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fe77a4635f7d8..1e5a46946c6c0 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -50,7 +50,7 @@ class MultiModalPlugin(ABC): (i.e., the modality of the data). See also: - :ref:`adding_multimodal_plugin` + :ref:`adding-multimodal-plugin` """ def __init__(self) -> None: @@ -94,8 +94,8 @@ def register_input_mapper( If `None` is provided, then the default input mapper is used instead. 
See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -130,8 +130,8 @@ def map_input( TypeError: If the data type is not supported. See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ # Avoid circular import @@ -190,7 +190,7 @@ def register_max_multimodal_tokens( If `None` is provided, then the default calculation is used instead. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -222,7 +222,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: The model is identified by ``model_config``. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 97bbce1ce1570..c705e1a3d1554 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -84,3 +84,15 @@ def _default_input_mapper( def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: return 3000 + + +def rescale_image_size(image: Image.Image, + size_factor: float, + transpose: int = -1) -> Image.Image: + """Rescale the dimensions of an image by a constant factor.""" + new_width = int(image.width * size_factor) + new_height = int(image.height * size_factor) + image = image.resize((new_width, new_height)) + if transpose >= 0: + image = image.transpose(Image.Transpose(transpose)) + return image diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 138cc6a44c11a..9ecae2c1ca2bf 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -75,7 +75,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): This dictionary also accepts modality keys defined outside :class:`MultiModalDataBuiltins` as long as a customized plugin is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. + Read more on that :ref:`here `. """ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 6cd79d414c978..ded45a7184b5d 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -76,7 +76,7 @@ def register_plugin(self, plugin: MultiModalPlugin) -> None: Register a multi-modal plugin so it can be recognized by vLLM. See also: - :ref:`adding_multimodal_plugin` + :ref:`adding-multimodal-plugin` """ data_type_key = plugin.get_data_key() @@ -311,8 +311,8 @@ def register_processor( invoked to transform the data into a dictionary of model inputs. 
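# Usage sketch for the rescale_image_size helper added to
# vllm/multimodal/image.py earlier in this diff (illustrative; a blank
# PIL image stands in for real data, and the vLLM tree from this diff is
# assumed to be importable):
from PIL import Image

from vllm.multimodal.image import rescale_image_size

img = Image.new("RGB", (640, 480))
half = rescale_image_size(img, 0.5)
print(half.size)  # (320, 240)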
See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c898ca4e6573e..1cb9036bdfda3 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -2,7 +2,7 @@ import os from functools import lru_cache from io import BytesIO -from typing import Any, List, Optional, Tuple, TypeVar, Union +from typing import List, Optional, Tuple, TypeVar, Union import numpy as np import numpy.typing as npt @@ -14,7 +14,9 @@ from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from .audio import try_import_audio_packages from .inputs import MultiModalDataDict, PlaceholderRange +from .video import try_import_video_packages logger = init_logger(__name__) @@ -125,17 +127,7 @@ async def async_fetch_image(image_url: str, return image.convert(image_mode) -def _load_video_frames_from_bytes(b: bytes): - frame = Image.open(BytesIO(b)) - return np.array(frame) - - -def load_video_frames_from_base64(frame: Union[bytes, str]): - """Load frame from base64 format.""" - return _load_video_frames_from_bytes(base64.b64decode(frame)) - - -def _load_video_from_bytes(b: bytes, num_frames: int = 32): +def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray: _, decord = try_import_video_packages() video_path = BytesIO(b) @@ -155,13 +147,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32): return frames -def _load_video_from_data_url(video_url: str): - # Only split once and assume the second part is the base64 encoded image - frames_base64 = video_url.split(",")[1:] - return np.stack([ - load_video_frames_from_base64(frame_base64) - for frame_base64 in frames_base64 - ]) +def _load_video_from_data_url(video_url: str) -> npt.NDArray: + # Only split once and assume the second part is the base64 encoded video + _, video_base64 = video_url.split(",", 1) + + if video_url.startswith("data:video/jpeg;"): + return np.stack([ + np.array(load_image_from_base64(frame_base64)) + for frame_base64 in video_base64.split(",") + ]) + + return load_video_from_base64(video_base64) def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: @@ -204,16 +200,6 @@ async def async_fetch_video(video_url: str, return video -def try_import_audio_packages() -> Tuple[Any, Any]: - try: - import librosa - import soundfile - except ImportError as exc: - raise ImportError( - "Please install vllm[audio] for audio support.") from exc - return librosa, soundfile - - def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: """ Load audio from a URL. 
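# Rough standalone sketch of the data-URL branch rewritten above: a
# "data:video/jpeg;base64,<f1>,<f2>,..." URL carries one base64-encoded
# JPEG per frame, decoded frame by frame and stacked; any other payload
# is treated as a single base64-encoded video (illustrative only, not
# the vLLM helpers themselves):
import base64
from io import BytesIO

import numpy as np
from PIL import Image

def frames_from_jpeg_data_url(video_url: str) -> np.ndarray:
    _, payload = video_url.split(",", 1)
    return np.stack([
        np.array(Image.open(BytesIO(base64.b64decode(b64))))
        for b64 in payload.split(",")
    ])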
@@ -330,61 +316,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: return _load_image_from_bytes(base64.b64decode(image)) -def rescale_image_size(image: Image.Image, - size_factor: float, - transpose: int = -1) -> Image.Image: - """Rescale the dimensions of an image by a constant factor.""" - new_width = int(image.width * size_factor) - new_height = int(image.height * size_factor) - image = image.resize((new_width, new_height)) - if transpose >= 0: - image = image.transpose(Image.Transpose(transpose)) - return image - - -def try_import_video_packages() -> Any: - try: - import cv2 - import decord - except ImportError as exc: - raise ImportError( - "Please install vllm[video] for video support.") from exc - return cv2, decord - - -def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray: - cv2, _ = try_import_video_packages() - - num_frames, _, _, channels = frames.shape - new_height, new_width = size - resized_frames = np.empty((num_frames, new_height, new_width, channels), - dtype=frames.dtype) - for i, frame in enumerate(frames): - resized_frame = cv2.resize(frame, (new_width, new_height)) - resized_frames[i] = resized_frame - return resized_frames - - -def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: - _, height, width, _ = frames.shape - new_height = int(height * size_factor) - new_width = int(width * size_factor) - - return resize_video(frames, (new_height, new_width)) - - -def sample_frames_from_video(frames: npt.NDArray, - num_frames: int) -> npt.NDArray: - total_frames = frames.shape[0] - if num_frames == -1: - return frames - else: - frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) - sampled_frames = frames[frame_indices, ...] - return sampled_frames - - -def encode_video_base64(frames: npt.NDArray): +def encode_video_base64(frames: npt.NDArray) -> str: base64_frames = [] frames_list = [frames[i] for i in range(frames.shape[0])] for frame in frames_list: @@ -393,6 +325,11 @@ def encode_video_base64(frames: npt.NDArray): return ",".join(base64_frames) +def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray: + """Load video from base64 format.""" + return _load_video_from_bytes(base64.b64decode(video)) + + def resolve_visual_encoder_outputs( encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], feature_sample_layers: Optional[list[int]], diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index ba9bf58a4a20c..bfcdef70718bc 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional import numpy as np +import numpy.typing as npt from vllm.inputs.registry import InputContext from vllm.logger import init_logger @@ -75,3 +76,45 @@ def _default_input_mapper( def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: return 4096 + + +def try_import_video_packages() -> tuple[Any, Any]: + try: + import cv2 + import decord + except ImportError as exc: + raise ImportError( + "Please install vllm[video] for video support.") from exc + return cv2, decord + + +def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: + cv2, _ = try_import_video_packages() + + num_frames, _, _, channels = frames.shape + new_height, new_width = size + resized_frames = np.empty((num_frames, new_height, new_width, channels), + dtype=frames.dtype) + for i, frame in enumerate(frames): + resized_frame = cv2.resize(frame, (new_width, new_height)) + resized_frames[i] = resized_frame + return 
resized_frames + + +def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: + _, height, width, _ = frames.shape + new_height = int(height * size_factor) + new_width = int(width * size_factor) + + return resize_video(frames, (new_height, new_width)) + + +def sample_frames_from_video(frames: npt.NDArray, + num_frames: int) -> npt.NDArray: + total_frames = frames.shape[0] + if num_frames == -1: + return frames + + frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) + sampled_frames = frames[frame_indices, ...] + return sampled_frames diff --git a/vllm/outputs.py b/vllm/outputs.py index 2ecdf74ee59b3..b519c159b1531 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -355,7 +355,8 @@ def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": pooled_data = seq_group.pooled_data assert pooled_data is not None - output = PoolingOutput(pooled_data) + data = pooled_data.to(dtype=torch.float32, device="cpu") + output = PoolingOutput(data) prompt_token_ids = seq_group.prompt_token_ids finished = seq_group.is_finished() diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index d95a2b4cd5565..09bde9f065eaa 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -50,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: logger.warning( diff --git a/vllm/scripts.py b/vllm/scripts.py index a51c21cfa29e7..42e1c639eda10 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -165,7 +165,7 @@ def main(): required=False, help="Read CLI options from a config file." "Must be a YAML with the following options:" - "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server" + "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) serve_parser = make_arg_parser(serve_parser) serve_parser.set_defaults(dispatch_function=serve) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 2689802161987..de593113b938b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -108,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/usage/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
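# Quick check of the frame-sampling behaviour of the helper moved into
# vllm/multimodal/video.py above: np.linspace picks evenly spaced frame
# indices, so sampling 4 frames from a 100-frame clip uses 0, 33, 66, 99.
import numpy as np

frames = np.zeros((100, 8, 8, 3), dtype=np.uint8)  # dummy 100-frame video
idx = np.linspace(0, len(frames) - 1, 4, dtype=int)
print(idx.tolist())            # [0, 33, 66, 99]
print(frames[idx, ...].shape)  # (4, 8, 8, 3)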
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py new file mode 100644 index 0000000000000..6f63dab74d696 --- /dev/null +++ b/vllm/transformers_utils/s3_utils.py @@ -0,0 +1,146 @@ +import fnmatch +import os +import shutil +import signal +import tempfile +from pathlib import Path +from typing import Optional + +import boto3 + + +def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]: + return [ + path for path in paths if any( + fnmatch.fnmatch(path, pattern) for pattern in patterns) + ] + + +def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]: + return [ + path for path in paths + if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns) + ] + + +def glob(s3=None, + path: str = "", + allow_pattern: Optional[list[str]] = None) -> list[str]: + """ + List full file names from S3 path and filter by allow pattern. + + Args: + s3: S3 client to use. + path: The S3 path to list from. + allow_pattern: A list of patterns of which files to pull. + + Returns: + list[str]: List of full S3 paths allowed by the pattern + """ + if s3 is None: + s3 = boto3.client("s3") + bucket_name, _, paths = list_files(s3, + path=path, + allow_pattern=allow_pattern) + return [f"s3://{bucket_name}/{path}" for path in paths] + + +def list_files( + s3, + path: str, + allow_pattern: Optional[list[str]] = None, + ignore_pattern: Optional[list[str]] = None +) -> tuple[str, str, list[str]]: + """ + List files from S3 path and filter by pattern. + + Args: + s3: S3 client to use. + path: The S3 path to list from. + allow_pattern: A list of patterns of which files to pull. + ignore_pattern: A list of patterns of which files not to pull. + + Returns: + tuple[str, str, list[str]]: A tuple where: + - The first element is the bucket name + - The second element is string represent the bucket + and the prefix as a dir like string + - The third element is a list of files allowed or + disallowed by pattern + """ + parts = path.removeprefix('s3://').split('/') + prefix = '/'.join(parts[1:]) + bucket_name = parts[0] + + objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix) + paths = [obj['Key'] for obj in objects.get('Contents', [])] + + paths = _filter_ignore(paths, ["*/"]) + if allow_pattern is not None: + paths = _filter_allow(paths, allow_pattern) + + if ignore_pattern is not None: + paths = _filter_ignore(paths, ignore_pattern) + + return bucket_name, prefix, paths + + +class S3Model: + """ + A class representing a S3 model mirrored into a temporary directory. + + Attributes: + s3: S3 client. + dir: The temporary created directory. + + Methods: + pull_files(): Pull model from S3 to the temporary directory. + """ + + def __init__(self) -> None: + self.s3 = boto3.client('s3') + for sig in (signal.SIGINT, signal.SIGTERM): + existing_handler = signal.getsignal(sig) + signal.signal(sig, self._close_by_signal(existing_handler)) + self.dir = tempfile.mkdtemp() + + def __del__(self): + self._close() + + def _close(self) -> None: + if os.path.exists(self.dir): + shutil.rmtree(self.dir) + + def _close_by_signal(self, existing_handler=None): + + def new_handler(signum, frame): + self._close() + if existing_handler: + existing_handler(signum, frame) + + return new_handler + + def pull_files(self, + s3_model_path: str = "", + allow_pattern: Optional[list[str]] = None, + ignore_pattern: Optional[list[str]] = None) -> None: + """ + Pull files from S3 storage into the temporary directory. + + Args: + s3_model_path: The S3 path of the model. 
+ allow_pattern: A list of patterns of which files to pull. + ignore_pattern: A list of patterns of which files not to pull. + + """ + bucket_name, base_dir, files = list_files(self.s3, s3_model_path, + allow_pattern, + ignore_pattern) + if len(files) == 0: + return + + for file in files: + destination_file = self.dir + file.removeprefix(base_dir) + local_dir = Path(destination_file).parent + os.makedirs(local_dir, exist_ok=True) + self.s3.download_file(bucket_name, file, destination_file) diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 7a9041b04fbb9..10a09fb4f566c 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -3,6 +3,10 @@ from typing import Union +def is_s3(model_or_path: str) -> bool: + return model_or_path.lower().startswith('s3://') + + def check_gguf_file(model: Union[str, PathLike]) -> bool: """Check if the file is a GGUF model.""" model = Path(model) diff --git a/vllm/utils.py b/vllm/utils.py index 8eb5630bee0e0..67b35183757c5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -51,7 +51,7 @@ # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/usage/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 41fb4b25d45bb..cfdbea8004c35 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -98,7 +98,7 @@ def from_engine_args( start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "AsyncLLMEngine": + ) -> "AsyncLLM": """Create an AsyncLLM from the EngineArgs.""" # Create the engine configs. @@ -386,7 +386,3 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: return Exception() # TODO: implement - - -# Retain V0 name for backwards compatibility. -AsyncLLMEngine = AsyncLLM diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bea8c5502f612..b58f62778ffe9 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -110,7 +110,10 @@ def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] distributed_executor_backend = ( vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "mp": + if distributed_executor_backend == "ray": + from vllm.v1.executor.ray_executor import RayExecutor + executor_class = RayExecutor + elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor else: diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py index 218724bff6bba..8bfc739b3dbbc 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_mapper.py @@ -180,6 +180,10 @@ def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]: return None mm_data = prompt["multi_modal_data"] + if not mm_data: + # mm_data can be None or an empty dict. 
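# Hedged usage sketch for the S3 helpers introduced in
# vllm/transformers_utils/s3_utils.py above. The bucket and paths are
# placeholders and AWS credentials are assumed to be configured; glob()
# lists matching objects and S3Model mirrors them into a temp directory.
from vllm.transformers_utils.s3_utils import S3Model, glob
from vllm.transformers_utils.utils import is_s3

model_uri = "s3://my-bucket/models/my-model/"  # hypothetical path
if is_s3(model_uri):
    print(glob(path=model_uri, allow_pattern=["*.json"]))
    s3_model = S3Model()
    s3_model.pull_files(model_uri,
                        allow_pattern=["*.safetensors", "*.json"],
                        ignore_pattern=["*.bin"])
    print("local copy at", s3_model.dir)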
+ return None + image_inputs = mm_data["image"] return self.hash_images(image_inputs) diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py new file mode 100644 index 0000000000000..79acc60001c99 --- /dev/null +++ b/vllm/v1/executor/ray_executor.py @@ -0,0 +1,342 @@ +import os +from collections import defaultdict +from itertools import islice, repeat +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.v1.executor.abstract import Executor +from vllm.v1.executor.ray_utils import (RayWorkerWrapper, + initialize_ray_cluster, ray) +from vllm.v1.outputs import ModelRunnerOutput + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +class RayExecutor(Executor): + + def __init__(self, vllm_config: VllmConfig) -> None: + self.vllm_config = vllm_config + self.parallel_config = vllm_config.parallel_config + self.model_config = vllm_config.model_config + self.forward_dag: Optional[ray.dag.CompiledDAG] = None + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + initialize_ray_cluster(self.parallel_config) + placement_group = self.parallel_config.placement_group + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + # A list of workers to run a model. + self.workers: List[RayWorkerWrapper] = [] + if self.parallel_config.ray_workers_use_nsight: + ray_remote_kwargs = self._configure_ray_workers_use_nsight( + ray_remote_kwargs) + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + # Skip bundles that don't have GPUs, + # as each worker needs one GPU. + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + worker = ray.remote( + num_cpus=0, + num_gpus=1, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) + self.workers.append(worker) + + logger.debug("workers: %s", self.workers) + worker_ips = [ + ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] + for worker in self.workers + ] + ip_counts: Dict[str, int] = {} + for ip in worker_ips: + ip_counts[ip] = ip_counts.get(ip, 0) + 1 + + worker_to_ip = dict(zip(self.workers, worker_ips)) + + def sort_by_driver_then_worker_ip(worker): + """ + Sort the workers based on 3 properties: + 1. If the worker is on the same node as the driver (vllm engine), + it should be placed first. + 2. Then, if the worker is on a node with fewer workers, it should + be placed first. + 3. Finally, if the work is on a node with smaller IP address, it + should be placed first. This is simply a tiebreaker to make + sure the workers are sorted in a deterministic way. 
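# The docstring above describes the three-way ordering used to sort the
# Ray workers. A tiny standalone illustration (hypothetical IPs): Python
# compares the key tuples lexicographically, so driver-node workers come
# first (False < True), then nodes with fewer workers, then IP order.
driver_ip = "10.0.0.1"
worker_ips = ["10.0.0.2", "10.0.0.1", "10.0.0.2", "10.0.0.3"]
ip_counts = {ip: worker_ips.count(ip) for ip in worker_ips}
ordered = sorted(worker_ips,
                 key=lambda ip: (ip != driver_ip, ip_counts[ip], ip))
print(ordered)  # ['10.0.0.1', '10.0.0.3', '10.0.0.2', '10.0.0.2']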
+ """ + ip = worker_to_ip[worker] + return (ip != driver_ip, ip_counts[ip], ip) + + # After sorting, the workers on the same node will be + # close to each other, and the workers on the driver + # node will be placed first. + self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) + + # Get the set of GPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids") + + node_workers = defaultdict(list) # node id -> list of worker ranks + node_gpus = defaultdict(list) # node id -> list of gpu ids + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] + node_gpus[node_id].extend(gpu_ids) + + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + all_ips = set(worker_ips) + n_ips = len(all_ips) + n_nodes = len(node_workers) + + if n_nodes != n_ips: + raise RuntimeError( + f"Every node should have a unique IP address. Got {n_nodes}" + f" nodes with node ids {list(node_workers.keys())} and " + f"{n_ips} unique IP addresses {all_ips}. Please check your" + " network configuration. If you set `VLLM_HOST_IP` or " + "`HOST_IP` environment variable, make sure it is unique for" + " each node.") + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "CUDA_VISIBLE_DEVICES": + ",".join(map(str, node_gpus[node_id])), + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + "VLLM_USE_V1": + str(int(envs.VLLM_USE_V1)), + **({ + "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND + } if envs.VLLM_ATTENTION_BACKEND is not None else {}) + }, ) for (node_id, _) in worker_node_and_gpu_ids] + + self._env_vars_for_all_workers = ( + all_args_to_update_environment_variables) + + self._run_workers("update_environment_variables", + all_args=self._get_env_vars_to_be_updated()) + + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + self._run_workers("initialize") + self._run_workers("load_model") + + def _configure_ray_workers_use_nsight(self, + ray_remote_kwargs) -> Dict[str, Any]: + # If nsight profiling is enabled, we need to set the profiling + # configuration for the ray workers as runtime env. 
+ runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) + runtime_env.update({ + "nsight": { + "t": "cuda,cudnn,cublas", + "o": "'worker_process_%p'", + "cuda-graph-trace": "node", + } + }) + + return ray_remote_kwargs + + def _get_env_vars_to_be_updated(self): + return self._env_vars_for_all_workers + + def _get_worker_kwargs( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None) -> Dict[str, Any]: + """ + Return worker init args for a given rank. + """ + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + return dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + ) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """ + Determine the number of available KV blocks. + + This invokes `determine_num_available_blocks` on each worker and takes + the min of the results, guaranteeing that the selected cache sizes are + compatible with all workers. + + Returns: + - tuple[num_gpu_blocks, num_cpu_blocks] + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers("determine_num_available_blocks") + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + + return num_gpu_blocks, num_cpu_blocks + + def initialize(self, num_gpu_blocks: int) -> None: + """ + Initialize the KV cache in all workers. + """ + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info("# GPU blocks: %d", num_gpu_blocks) + self._run_workers("initialize_cache", num_gpu_blocks) + self._run_workers("compile_or_warm_up_model") + + def _run_workers( + self, + method: str, + *args, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + **kwargs, + ) -> Any: + """ + Runs the given method on all workers. Can be used in the following + ways: + + Args: + - args/kwargs: All workers share the same args/kwargs + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 0, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 0, None) + + ray_worker_refs = [ + worker.execute_method.remote( # type: ignore[attr-defined] + method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + return ray.get(ray_worker_refs) + + def execute_model( + self, + scheduler_output, + ) -> ModelRunnerOutput: + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag() + # Only the first worker (with rank 0) returns the execution result. + # Others return None. 
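# Small sketch of the argument fan-out in _run_workers defined above:
# without per-worker overrides every worker receives the same
# (args, kwargs); with all_args/all_kwargs each worker is zipped with
# its own entry (worker names here are hypothetical).
from itertools import islice, repeat

workers = ["w0", "w1", "w2"]
args, all_args = ("ping",), None
worker_args = repeat(args, len(workers)) if all_args is None \
    else islice(all_args, 0, None)
print(list(zip(workers, worker_args)))
# [('w0', ('ping',)), ('w1', ('ping',)), ('w2', ('ping',))]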
+ output = ray.get(self.forward_dag.execute(scheduler_output))[0] + return output + + def profile(self, is_start=True): + raise NotImplementedError + + def shutdown(self): + if hasattr(self, "forward_dag") and self.forward_dag is not None: + self.forward_dag.teardown() + import ray + for worker in self.workers: + ray.kill(worker) + self.forward_dag = None + + def check_health(self) -> None: + logger.debug("Called check_health.") + + def _check_ray_compiled_graph_installation(self): + import pkg_resources + from packaging import version + + required_version = version.parse("2.39") + current_version = version.parse( + pkg_resources.get_distribution("ray").version) + if current_version < required_version: + raise ValueError(f"Ray version {required_version} is " + f"required, but found {current_version}") + + import importlib.util + raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref") + if raycg is None: + raise ValueError("Ray Compiled Graph is not installed. " + "Run `pip install ray[adag]` to install it.") + + cupy_spec = importlib.util.find_spec("cupy") + if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: + raise ValueError( + "cupy is not installed but required since " + "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." + "Run `pip install ray[adag]` and check cupy installation.") + + def _compiled_ray_dag(self): + assert self.parallel_config.use_ray + self._check_ray_compiled_graph_installation() + from ray.dag import InputNode, MultiOutputNode + + with InputNode() as input_batches: + outputs = [ + worker.execute_model.bind( # type: ignore[attr-defined] + input_batches) for worker in self.workers + ] + forward_dag = MultiOutputNode(outputs) + + return forward_dag.experimental_compile() + + def __del__(self): + self.shutdown() diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py new file mode 100644 index 0000000000000..7733610e59c7f --- /dev/null +++ b/vllm/v1/executor/ray_utils.py @@ -0,0 +1,271 @@ +import time +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple + +from vllm.config import ParallelConfig +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import get_ip +from vllm.v1.outputs import ModelRunnerOutput +from vllm.worker.worker_base import WorkerWrapperBase + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + +logger = init_logger(__name__) +PG_WAIT_TIMEOUT = 60 + +try: + import ray + from ray.util import placement_group_table + from ray.util.placement_group import PlacementGroup + try: + from ray._private.state import available_resources_per_node + except ImportError: + # Ray 2.9.x doesn't expose `available_resources_per_node` + from ray._private.state import state as _state + available_resources_per_node = _state._available_resources_per_node + + class RayWorkerWrapper(WorkerWrapperBase): + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + # Since the compiled DAG runs a main execution + # in a different thread that calls cuda.set_device. + # The flag indicates is set_device is called on + # that thread. It will be removed soon. 
+ self.compiled_dag_cuda_device_set = False + + def get_node_ip(self) -> str: + return get_ip() + + def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: + node_id = ray.get_runtime_context().get_node_id() + gpu_ids = ray.get_gpu_ids() + return node_id, gpu_ids + + def setup_device_if_necessary(self): + # TODO(swang): This is needed right now because Ray CG executes + # on a background thread, so we need to reset torch's current + # device. + # We can remove this API after it is fixed in compiled graph. + import torch + assert self.worker is not None, "Worker is not initialized" + if not self.compiled_dag_cuda_device_set: + torch.cuda.set_device(self.worker.device) + self.compiled_dag_cuda_device_set = True + + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> ModelRunnerOutput: + self.setup_device_if_necessary() + assert self.worker is not None, "Worker is not initialized" + output = self.worker.model_runner.execute_model(scheduler_output) + return output + + ray_import_err = None + +except ImportError as e: + ray = None # type: ignore + ray_import_err = e + RayWorkerWrapper = None # type: ignore + + +def ray_is_available() -> bool: + """Returns True if Ray is available.""" + return ray is not None + + +def assert_ray_available(): + """ + Raise an exception if Ray is not available. + """ + if ray is None: + raise ValueError("Failed to import Ray, please install Ray with " + "`pip install ray`.") from ray_import_err + + +def _verify_bundles(placement_group: "PlacementGroup", + parallel_config: ParallelConfig, device_str: str): + """ + Verify a given placement group has bundles located in the right place. + + There are 2 rules. + - Warn if all tensor parallel workers cannot fit in a single node. + - Fail if driver node is not included in a placement group. + + Args: + placement_group: The placement group to verify. + parallel_config: The parallel configuration. + device_str: The required device. + """ + assert ray.is_initialized(), ( + "Ray is not initialized although distributed-executor-backend is ray.") + pg_data = placement_group_table(placement_group) + # bundle_idx -> node_id + bundle_to_node_ids = pg_data["bundles_to_node_id"] + # bundle_idx -> bundle (e.g., {"GPU": 1}) + bundles = pg_data["bundles"] + # node_id -> List of bundle (e.g., {"GPU": 1}) + node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) + + for bundle_idx, node_id in bundle_to_node_ids.items(): + node_id_to_bundle[node_id].append(bundles[bundle_idx]) + driver_node_id = ray.get_runtime_context().get_node_id() + + if driver_node_id not in node_id_to_bundle: + raise RuntimeError( + f"driver node id {driver_node_id} is not included in a placement " + f"group {placement_group.id}. Node id -> bundles " + f"{node_id_to_bundle}. " + "You don't have enough GPUs available in a current node. Check " + "`ray status` to see if you have available GPUs in a node " + f"{driver_node_id} before starting an vLLM engine.") + + for node_id, bundles in node_id_to_bundle.items(): + if len(bundles) < parallel_config.tensor_parallel_size: + logger.warning( + "tensor_parallel_size=%d " + "is bigger than a reserved number of %ss (%d " + "%ss) in a node %s. Tensor parallel workers can be " + "spread out to 2+ nodes which can degrade the performance " + "unless you have fast interconnect across nodes, like " + "Infiniband. 
To resolve this issue, make sure you have more " + "than %d GPUs available at each node.", + parallel_config.tensor_parallel_size, device_str, len(bundles), + device_str, node_id, parallel_config.tensor_parallel_size) + + +def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): + """Wait until a placement group is ready. + + It prints the informative log messages if the placement group is + not created within time. + + """ + # Wait until PG is ready - this will block until all + # requested resources are available, and will timeout + # if they cannot be provisioned. + placement_group_specs = current_placement_group.bundle_specs + + s = time.time() + pg_ready_ref = current_placement_group.ready() + wait_interval = 10 + while time.time() - s < PG_WAIT_TIMEOUT: + ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) + if len(ready) > 0: + break + + # Exponential backoff for warning print. + wait_interval *= 2 + logger.info( + "Waiting for creating a placement group of specs for " + "%d seconds. specs=%s. Check " + "`ray status` to see if you have enough resources.", + int(time.time() - s), placement_group_specs) + + try: + ray.get(pg_ready_ref, timeout=0) + except ray.exceptions.GetTimeoutError: + raise ValueError( + "Cannot provide a placement group of " + f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " + "`ray status` to make sure the cluster has enough resources." + ) from None + + +def initialize_ray_cluster( + parallel_config: ParallelConfig, + ray_address: Optional[str] = None, +): + """Initialize the distributed cluster with Ray. + + it will connect to the Ray cluster and create a placement group + for the workers, which includes the specification of the resources + for each distributed worker. + + Args: + parallel_config: The configurations for parallel execution. + ray_address: The address of the Ray cluster. If None, uses + the default Ray cluster address. + """ + assert_ray_available() + + # Connect to a ray cluster. + if current_platform.is_rocm() or current_platform.is_xpu(): + # Try to connect existing ray instance and create a new one if not found + try: + ray.init("auto") + except ConnectionError: + logger.warning( + "No existing RAY instance detected. " + "A new instance will be launched with current node resources.") + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) + else: + ray.init(address=ray_address, ignore_reinit_error=True) + + if parallel_config.placement_group: + # Placement group is already set. + return + + device_str = "GPU" if not current_platform.is_tpu() else "TPU" + # Create placement group for worker processes + current_placement_group = ray.util.get_current_placement_group() + if current_placement_group: + # We are in a placement group + bundles = current_placement_group.bundle_specs + # Verify that we can use the placement group. + device_bundles = 0 + for bundle in bundles: + bundle_devices = bundle.get(device_str, 0) + if bundle_devices > 1: + raise ValueError( + "Placement group bundle cannot have more than 1 " + f"{device_str}.") + if bundle_devices: + device_bundles += 1 + if parallel_config.world_size > device_bundles: + raise ValueError( + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group." + f"Required number of devices: {parallel_config.world_size}. 
" + f"Total number of devices: {device_bundles}.") + else: + num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + if parallel_config.world_size > num_devices_in_cluster: + raise ValueError( + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group.") + # Create a new placement group + placement_group_specs: List[Dict[str, float]] = ([{ + device_str: 1.0 + } for _ in range(parallel_config.world_size)]) + + # vLLM engine is also a worker to execute model with an accelerator, + # so it requires to have the device in a current node. Check if + # the current node has at least one device. + current_ip = get_ip() + current_node_id = ray.get_runtime_context().get_node_id() + current_node_resource = available_resources_per_node()[current_node_id] + if current_node_resource.get(device_str, 0) < 1: + raise ValueError( + f"Current node has no {device_str} available. " + f"{current_node_resource=}. vLLM engine cannot start without " + f"{device_str}. Make sure you have at least 1 {device_str} " + f"available in a node {current_node_id=} {current_ip=}.") + # This way, at least bundle is required to be created in a current + # node. + placement_group_specs[0][f"node:{current_ip}"] = 0.001 + + # By default, Ray packs resources as much as possible. + current_placement_group = ray.util.placement_group( + placement_group_specs, strategy="PACK") + _wait_until_pg_ready(current_placement_group) + + assert current_placement_group is not None + _verify_bundles(current_placement_group, parallel_config, device_str) + # Set the placement group in the parallel config + parallel_config.placement_group = current_placement_group diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cb89246db0cc9..ace62d8978bea 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -635,17 +635,6 @@ def profile_run(self) -> None: ) dummy_mm_data = dummy_request_data.multi_modal_data - # Compute MM hashes (if enabled) - mm_hashes = None - if self.use_hash: - mm_hashes = self.mm_hasher.hash_dummy_mm_data(dummy_mm_data) - - dummy_mm_kwargs = self.mm_input_mapper_client.process_inputs( - mm_data=dummy_mm_data, - mm_hashes=mm_hashes, - mm_processor_kwargs=None, - precomputed_mm_inputs=None) - # NOTE: Currently model is profiled with a single non-text # modality even when it supports multiple. max_tokens_per_mm_item = max( @@ -660,8 +649,39 @@ def profile_run(self) -> None: # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 # they are scheduled to be processed separately. + + # Case when models have a merged processor, their dummy data is + # already batched `MultiModalKwargs`, therefore we need to "unbatch" + # and take the first item in each batched tensor. + # TODO (ywang96): This is somewhat hacky. Refactor this to be + # consistent with the other case. + if isinstance(dummy_mm_data, MultiModalKwargs): + dummy_mm_kwargs = { + k: v[0].unsqueeze(0) + for k, v in dummy_mm_data.items() + } + + # Case when models have dummy data explicitly defined as + # `MultiModalDataDict`, so they need to be processed through input + # mapper. 
+ else: + # Compute MM hashes (if enabled) + mm_hashes = None + if self.use_hash: + mm_hashes = self.mm_hasher.hash_dummy_mm_data( + dummy_mm_data) + + mm_kwargs_list = self.mm_input_mapper_client.process_inputs( + mm_data=dummy_mm_data, + mm_hashes=mm_hashes, + mm_processor_kwargs=None, + precomputed_mm_inputs=None) + + # Take the first `MultiModalKwargs` + dummy_mm_kwargs = mm_kwargs_list[0] + batched_dummy_mm_inputs = MultiModalKwargs.batch( - [dummy_mm_kwargs[0]] * max_num_mm_items) + [dummy_mm_kwargs] * max_num_mm_items) batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 33491f700de10..0000b09bfaa36 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -202,7 +202,6 @@ def execute_model( ) -> ModelRunnerOutput: output = self.model_runner.execute_model(scheduler_output) return output if self.rank == 0 else None - return output def profile(self, is_start: bool = True): if self.profiler is None: diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 420aaf8a1b4cd..f1531e0fc0675 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -114,8 +114,7 @@ class ModelInputData: def __init__(self, use_mrope: bool): self.use_mrope = use_mrope self.input_tokens: List[int] = [] - self.input_positions: Optional[ - List[int]] = [] if not self.use_mrope else None + self.input_positions: List[int] = [] self.token_type_ids: Optional[List[int]] = [] self.seq_lens: List[int] = [] self.query_lens: List[int] = [] @@ -130,9 +129,8 @@ def __init__(self, use_mrope: bool): self.multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict( MultiModalPlaceholderMap) - self.input_mrope_positions: Optional[List[List[int]]] = [ - [] for _ in range(3) - ] if self.use_mrope else None + self.input_mrope_positions: List[List[int]] = [[] + for _ in range(3)] def __init__(self, runner: "CPUModelRunner", @@ -167,7 +165,8 @@ def build(self) -> ModelInputForCPU: device="cpu") input_positions = torch.tensor( input_data.input_positions - if not input_data.use_mrope else input_data.input_mrope_positions, + if not any(input_data.input_mrope_positions) else + input_data.input_mrope_positions, dtype=torch.long, device="cpu") token_type_ids = torch.tensor(input_data.token_type_ids, @@ -236,7 +235,7 @@ def _compute_decode_input_tokens(self, data: ModelInputData, block_table = block_table[start_block:] # For MRotaryEmbedding - if data.input_positions is None: + if seq_data.mrope_position_delta is not None: next_pos = MRotaryEmbedding.get_next_input_positions( seq_data.mrope_position_delta, context_len, @@ -309,8 +308,7 @@ def _compute_prompt_input_tokens(self, data: ModelInputData, data.slot_mapping.extend(slot_mapping) # The MROPE positions are prepared in _compute_multi_modal_input - if data.input_positions is not None: - data.input_positions.extend(token_positions) + data.input_positions.extend(token_positions) if data.token_type_ids is not None: data.token_type_ids.extend(token_types if token_types else []) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 09758a5d9accf..b5dfebfce6f75 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -333,9 +333,8 @@ def execute_worker( def prepare_worker_input( self, execute_model_req: ExecuteModelRequest) -> WorkerInput: assert execute_model_req is not None - virtual_engine = execute_model_req.virtual_engine + 
virtual_engine: int = execute_model_req.virtual_engine num_seq_groups: int = len(execute_model_req.seq_group_metadata_list) - blocks_to_copy = execute_model_req.blocks_to_copy blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, device="cpu", dtype=torch.int64).view(-1, 2) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 18b03bf1bfb56..65d9bab0e2822 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -406,8 +406,9 @@ def _async_process_outputs(self, model_input: StatefulModelInput, if not cont: break - def _final_process_outputs(self, model_input: StatefulModelInput, - output_proc_callback: Optional[Callable]): + def _final_process_outputs( + self, model_input: StatefulModelInput, + output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: assert model_input.frozen_model_input is not None has_async_callback = output_proc_callback is not None @@ -594,8 +595,8 @@ def execute_model( # should be [SamplerOutput] return output - def _update_sampling_metadata(self, sampling_metadata, num_seqs, - num_queries): + def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata, + num_seqs: Optional[int], num_queries: int): assert sampling_metadata.num_prompts == 0 assert len(sampling_metadata.seq_groups) == num_queries @@ -820,7 +821,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: @@ -850,13 +851,13 @@ def _pythonize_sampler_output( seq_ids = seq_group.seq_ids next_token_ids = sample_result parent_ids = [0] + seq_outputs: List[SequenceOutput] if cache is not None: completion_seq_group_output: CompletionSequenceGroupOutput = \ cache.cached_completion_seq_group_output.get_object() completion_seq_group_output.samples.clear() - seq_outputs: List[ - SequenceOutput] = completion_seq_group_output.samples + seq_outputs = completion_seq_group_output.samples else: seq_outputs = [] diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 5f71ec0c14df8..8f2d343440d3e 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 6d00102e0a324..3ac7fb8dfb766 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -452,7 +452,7 @@ def init_worker(self, *args, **kwargs): self.worker = worker_class(*args, **kwargs) assert self.worker is not None - def execute_method(self, method, *args, **kwargs): + def execute_method(self, method: str, *args, **kwargs): try: target = self if self.worker is None else self.worker executor = getattr(target, method)
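# Minimal sketch of the getattr-based dispatch that execute_method above
# relies on (illustrative classes only): the method name arrives as a
# string and is resolved on the wrapper itself or on the wrapped worker.
class _Worker:
    def load_model(self, path: str) -> str:
        return f"loaded {path}"

class _Wrapper:
    def __init__(self) -> None:
        self.worker = _Worker()

    def execute_method(self, method: str, *args, **kwargs):
        target = self if self.worker is None else self.worker
        return getattr(target, method)(*args, **kwargs)

print(_Wrapper().execute_method("load_model", "facebook/opt-125m"))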