diff --git a/.github/mergify.yml b/.github/mergify.yml new file mode 100644 index 0000000000000..1ce5039a061b2 --- /dev/null +++ b/.github/mergify.yml @@ -0,0 +1,58 @@ +pull_request_rules: +- name: label-documentation + description: Automatically apply documentation label + conditions: + - or: + - files~=^[^/]+\.md$ + - files~=^docs/ + actions: + label: + add: + - documentation + +- name: label-ci-build + description: Automatically apply ci/build label + conditions: + - or: + - files~=^\.github/ + - files~=\.buildkite/ + - files~=^cmake/ + - files=CMakeLists.txt + - files~=^Dockerfile + - files~=^requirements.*\.txt + - files=setup.py + actions: + label: + add: + - ci/build + +- name: label-frontend + description: Automatically apply frontend label + conditions: + - files~=^vllm/entrypoints/ + actions: + label: + add: + - frontend + +- name: ping author on conflicts and add 'needs-rebase' label + conditions: + - conflict + - -closed + actions: + label: + add: + - needs-rebase + comment: + message: | + This pull request has merge conflicts that must be resolved before it can be + merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + +- name: remove 'needs-rebase' label when conflict is resolved + conditions: + - -conflict + - -closed + actions: + label: + remove: + - needs-rebase diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 2418c61bdcf63..81e7c9b050760 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -10,6 +10,7 @@ jobs: permissions: issues: write pull-requests: write + actions: write runs-on: ubuntu-latest steps: - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index fc4ac10b7669a..1a6a311e97633 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") # @@ -507,7 +507,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd + GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5f79356bd32f7..b39fd75b5fb70 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,12 +11,14 @@ We also believe in the power of community support; thus, answering queries, offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! +## License + +See [LICENSE](LICENSE). ## Developing Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. 
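For orientation alongside the linked documentation, here is a minimal, hedged sketch of the full-compilation path for building vLLM from source (an editable install; the Python-only, no-compilation workflow mentioned above is covered in the linked docs and is not reproduced here):

```bash
# Clone the repository and build vLLM from source in editable mode.
# The editable install compiles the CUDA/C++ extensions and can take a while.
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -e .
```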
- ## Testing ```bash @@ -33,6 +35,14 @@ pytest tests/ ## Contribution Guidelines +### DCO and Signed-off-by + +When contributing changes to this project, you must agree to the [DCO](DCO). +Commits must include a `Signed-off-by:` header which certifies agreement with +the terms of the [DCO](DCO). + +Using `-s` with `git commit` will automatically add this header. + ### Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/DCO b/DCO new file mode 100644 index 0000000000000..49b8cb0549267 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/Dockerfile.rocm b/Dockerfile.rocm index d35889f053e27..562117a313020 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -121,6 +121,8 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install --upgrade pip + # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard diff --git a/README.md b/README.md index 0836d872358fb..b75bfc5c699a7 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,19 @@ Easy, fast, and cheap LLM serving for everyone | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |

+--- + +**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo** + +We are excited to announce the last in-person vLLM meetup of the year! +Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist! +Register [here](https://lu.ma/h0qvrajz) and be a part of the event! + +--- + *Latest News* 🔥 -- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! +- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). @@ -42,7 +52,7 @@ vLLM is fast with: - Speculative decoding - Chunked prefill -**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. +**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. 
vLLM is flexible and easy to use with: diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 92f6053cc6d7e..7acea6087fdfd 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -3,8 +3,8 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -16,7 +16,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") layer = RMSNorm(hidden_size).to(dtype=dtype) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2ad98b7e2656..8f538c21f7f7e 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -10,7 +10,8 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): @@ -88,22 +89,23 @@ def prepare(i: int): input_gating.copy_(gating_output[i]) def run(): - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - override_config=config, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - ) + from vllm.model_executor.layers.fused_moe import override_config + with override_config(config): + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) # JIT compilation & warmup run() @@ -166,7 +168,7 @@ class BenchmarkWorker: def __init__(self, seed: int) -> None: torch.set_default_device("cuda") - seed_everything(seed) + current_platform.seed_everything(seed) self.seed = seed def benchmark( @@ -180,7 +182,7 @@ def benchmark( use_fp8_w8a8: bool, use_int8_w8a16: bool, ) -> Tuple[Dict[str, int], float]: - seed_everything(self.seed) + current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 87864d038d593..14eef00b855ac 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -5,8 +5,9 @@ import torch from vllm import _custom_ops as ops +from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - create_kv_caches_with_random, seed_everything) + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -28,7 +29,7 @@ def main( device: str = "cuda", kv_cache_dtype: Optional[str] = None, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) scale = float(1.0 / (head_size**0.5)) query = torch.empty(num_seqs, diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 743a5744e8614..1d62483448946 100644 --- 
a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -3,8 +3,8 @@ import torch from vllm import _custom_ops as ops -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -17,7 +17,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") x = torch.randn(num_tokens, hidden_size, dtype=dtype) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 784b1cf9844e4..250d505168d09 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -6,7 +6,8 @@ from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 24bb7299338ac..40430dae10c5b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of # dependencies that are not necessary and may not be installed. 
if (GPU_LANGUAGE STREQUAL "CUDA") - if ("${CUDA_CUDA_LIB}" STREQUAL "") - set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}") - endif() - target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB} - ${CUDA_LIBRARIES}) + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) else() target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) endif() diff --git a/csrc/ops.h b/csrc/ops.h index f737f50c2ec96..c50eb39a3dacc 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -5,6 +5,30 @@ #include "core/scalar_type.hpp" +#include + +torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { + // Ensure tensor is on CUDA + if (!tensor.is_cuda()) { + throw std::runtime_error("Tensor must be on CUDA device"); + } + + // Get the raw data pointer + void* data_ptr = tensor.data_ptr(); + + // Get tensor sizes and strides + std::vector<int64_t> sizes = tensor.sizes().vec(); + std::vector<int64_t> strides = tensor.strides().vec(); + + // Get tensor options (dtype, device) + auto options = tensor.options(); + + // Create a new tensor from the raw data pointer + auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options); + + return new_tensor; +} + void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index e704ff629fd6e..b8185c24d5628 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -18,6 +18,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops + ops.def("weak_ref_tensor(Tensor input) -> Tensor"); + ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); + // Attention ops // Compute the attention between an input query and the cached // keys/values using PagedAttention. diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst index 151ebb5f1811f..b1868acbc84b0 100644 --- a/docs/source/getting_started/xpu-installation.rst +++ b/docs/source/getting_started/xpu-installation.rst @@ -60,3 +60,21 @@ Build from source - FP16 is the default data type in the current XPU backend. The BF16 data type will be supported in the future. + +Distributed inference and serving +--------------------------------- + +The XPU platform supports tensor-parallel inference/serving and also supports pipeline parallelism as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following: + +.. code-block:: console + + $ python -m vllm.entrypoints.openai.api_server \ + $ --model=facebook/opt-13b \ + $ --dtype=bfloat16 \ + $ --device=xpu \ + $ --max_model_len=1024 \ + $ --distributed-executor-backend=ray \ + $ --pipeline-parallel-size=2 \ + $ -tp=8 + +By default, a Ray instance will be launched automatically if no existing one is detected in the system, with ``num-gpus`` equal to ``parallel_config.world_size``. We recommend properly starting a Ray cluster before execution, referring to the helper `script `_.
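The helper script link above is elided in this diff, so as a rough sketch only: a Ray cluster can be brought up manually before launching the server using the standard Ray CLI. `HEAD_NODE_IP` and the port below are placeholders, not values taken from this change.

```bash
# On the head node (choose any free port; 6379 is Ray's default):
ray start --head --port=6379

# On each worker node, join the existing cluster:
ray start --address=HEAD_NODE_IP:6379

# Verify that all nodes and accelerators are registered before starting vLLM:
ray status
```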
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index ae09259c0756c..c6d88cc38e99b 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -133,7 +133,9 @@ If you are running api server with :code:`vllm serve `, you can wrap the e from vllm import ModelRegistry from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - import runpy - runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') + + if __name__ == '__main__': + import runpy + runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') Save the above code in a file and run it with :code:`python your_file.py `. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 413c87ab28755..a1f93a9a28578 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -185,7 +185,9 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! + #### Hermes Models (`hermes`) + All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` @@ -197,7 +199,9 @@ step in their creation_. Flags: `--tool-call-parser hermes` + #### Mistral Models (`mistral`) + Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. @@ -216,7 +220,9 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + #### Llama Models (`llama3_json`) + Supported models: * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` @@ -236,7 +242,9 @@ it works better with vLLM. Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` + #### InternLM Models (`internlm`) + Supported models: * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well @@ -246,6 +254,7 @@ Known issues: Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` + #### Jamba Models (`jamba`) AI21's Jamba-1.5 models are supported. * `ai21labs/AI21-Jamba-1.5-Mini` @@ -255,6 +264,16 @@ AI21's Jamba-1.5 models are supported. Flags: `--tool-call-parser jamba` +#### IBM Granite (`granite-20b-fc`) + +Supported models: +* `ibm-granite/granite-20b-functioncalling` + +Flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + +The example chat template deviates slightly from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + + ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. 
@@ -312,5 +331,5 @@ Then you can use this plugin in the command line like this. --tool-parser-plugin --tool-call-parser example \ --chat-template \ -``` +``` diff --git a/examples/tool_chat_template_granite_20b_fc.jinja b/examples/tool_chat_template_granite_20b_fc.jinja new file mode 100644 index 0000000000000..cb52188ec72d9 --- /dev/null +++ b/examples/tool_chat_template_granite_20b_fc.jinja @@ -0,0 +1,130 @@ +{%- macro json_to_python_type(json_spec) %} + {%- set basic_type_map = { + "string": "str", + "number": "float", + "integer": "int", + "boolean": "bool" +} %} + + {%- if basic_type_map[json_spec.type] is defined %} + {{- basic_type_map[json_spec.type] }} + {%- elif json_spec.type == "array" %} + {{- "list[" + json_to_python_type(json_spec|items) + "]" }} + {%- elif json_spec.type == "object" %} + {%- if json_spec.additionalProperties is defined %} + {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }} + {%- else %} + {{- "dict" }} + {%- endif %} + {%- elif json_spec.type is iterable %} + {{- "Union[" }} + {%- for t in json_spec.type %} + {{- json_to_python_type({"type": t}) }} + {%- if not loop.last %} + {{- "," }} + {%- endif %} + {%- endfor %} + {{- "]" }} + {%- else %} + {{- "Any" }} + {%- endif %} +{%- endmacro %} + +{%- if not full_function_description is defined %} + {%- set full_function_description = false %} +{%- endif %} + +{%- macro full_description(tool) %} + {{- tool.name + '(' }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- param_name + ": " + json_to_python_type(param_fields) }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- ")" }} + {%- if tool.return is defined %} + {{- " -> " + json_to_python_type(tool.return) }} + {%- endif %} + {{- " - " + tool.description + "\n\n" }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {%- if loop.first %} + {{- " Args:\n" }} + {%- endif %} + {{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }} + {%- endfor %} + {%- endif %} + {%- if tool.return is defined and tool.return.description is defined %} + {{- "\n Returns:\n " + tool.return.description }} + {%- endif %} + {{- '"' }} +{%- endmacro %} + +{%- macro simple_description(tool) %} + {{- tool.description }} +{%- endmacro %} + +{%- macro function_description(tool) %} + {%- if full_function_description %} + {{- full_description(tool) }} + {%- else %} + {{- simple_description(tool) }} + {%- endif %} +{%- endmacro %} + +{%- if messages[0]["role"] == "system" %} + {%- set sys_prompt = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {% set sys_prompt = 'You are a helpful assistant with access to the following function calls. Your task is to understand the given conversation with function calls and responses and generate natural language response as the ASSISTANT to continue the conversation. You may use the following function calls to understand how to respond to the user query.' 
%} +{%- endif %} + +{{ 'SYSTEM: ' + sys_prompt }} +{% if tools is iterable and tools | length > 0 %} +<|function_call_library|> + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- '{"name": "' + tool.name + '", ' }} + {{- '"description": "' + function_description(tool) }} + {{- ', "parameters": ' }} + {%- if not tool.parameters is defined or tool.parameters.properties | length == 0 %} + {{- "{}" }} + {%- else %} + {{- tool.parameters|tojson }} + {%- endif %} + {{- "}" }} + {%- if not loop.last %} + {{- "\n" }} + {%- endif %} + {%- endfor %} +If none of the functions are relevant or the given question lacks the parameters required by the function, please output \" {\"name\": \"no_function\", \"arguments\": {}}\". +{%- endif %} + + + +{% for message in messages %} + {% if message['role'] == 'user' %} + {{- '\nUSER: ' + message['content'] }} + {% elif message['role'] == 'assistant' and message.tool_calls is defined %} + {{- '\nASSISTANT:' }} + {% for tc in message.tool_calls %} + {{- ' ' + {'name': tc.function.name, 'arguments': tc.function.arguments}|tojson }} + {% endfor %} + {{- '<|endoftext|>' }} + {% elif message['role'] == 'assistant' %} + {{- '\nASSISTANT: ' + message['content'] + ' <|endoftext|>' }} + {% elif message['role'] == 'tool' %} + {{- ' ' + message['content'] }} + {%- else %} + {{- raise_exception("Unexpected combination of role and message content") }} + {% endif %} + {% if loop.last and add_generation_prompt %} + {{- '\nASSISTANT: ' }} + {% endif %} +{% endfor %} diff --git a/pyproject.toml b/pyproject.toml index e0c56ab79cad0..e78f5652f486b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.4.0", + "torch == 2.5.0", "wheel", "jinja2", ] diff --git a/requirements-build.txt b/requirements-build.txt index 6144a56da8c47..7b16d9778c1a6 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,9 +1,9 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=61 -setuptools-scm>=8 -torch==2.4.0 -wheel -jinja2 +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=61 +setuptools-scm>=8 +torch==2.5.0 +wheel +jinja2 diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 3b3c2f876919e..92fa303d687a2 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py # for pynvml package -torch == 2.4.0 +torch == 2.5.0 # These must be updated alongside torch -torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0 +torchvision == 0.20 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0 diff --git a/requirements-openvino.txt b/requirements-openvino.txt index ac54cf0c3288f..7ad0d1e7f704b 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,7 +1,7 @@ # Common dependencies -r requirements-common.txt -torch == 2.4.0 # should be aligned with "common" vLLM torch version +torch == 2.5.0 # should be aligned with "common" vLLM torch version openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version diff --git a/requirements-test.in b/requirements-test.in new file mode 100644 index 0000000000000..3881f2566b556 --- /dev/null +++ b/requirements-test.in @@ -0,0 +1,37 @@ +# testing +pytest +tensorizer>=2.9.0 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard + +# testing utils +awscli +einops # required for MPT, qwen-vl and Mamba +httpx +librosa # required for audio tests +opencv-python # required for video tests +peft +requests +ray[adag]==2.35 +sentence-transformers # required for embedding +soundfile # required for audio test +timm # required for internvl test +torch==2.5.0 +transformers_stream_generator # required for qwen-vl test +matplotlib # required for qwen-vl test +datamodel_code_generator # required for minicpm3 test +lm-eval[api]==0.4.4 # required for model evaluation test + +# TODO: Add this after fully implementing llava(mantis) +# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test + +# Benchmarking +aiohttp + +# quantization +bitsandbytes>=0.44.0 +buildkite-test-collector==0.1.8 + +numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 9787fa2a4a486..c474c2ec34b22 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,34 +1,561 @@ -# testing -pytest -tensorizer>=2.9.0 -pytest-forked -pytest-asyncio -pytest-rerunfailures -pytest-shard - -# testing utils -awscli -einops # required for MPT, qwen-vl and Mamba -httpx -librosa # required for audio tests -opencv-python # required for video tests -peft -requests -ray[adag]==2.35 -sentence-transformers # required for embedding -soundfile # required for audio test -timm # required for internvl test -transformers_stream_generator # required for qwen-vl test -matplotlib # required for qwen-vl test -datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test - -# TODO: Add this after fully implementing llava(mantis) -# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test - -# Benchmarking -aiohttp - -# quantization -bitsandbytes>=0.44.0 +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=requirements-test.txt requirements-test.in +# +absl-py==2.1.0 + # via rouge-score +accelerate==1.0.1 + # via + # lm-eval + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # -r requirements-test.in + # datasets + # fsspec + # lm-eval +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via httpx +argcomplete==3.5.1 + # via datamodel-code-generator +attrs==24.2.0 + # via + # aiohttp + # jsonlines + # jsonschema + # referencing 
+audioread==3.0.1 + # via librosa +awscli==1.35.16 + # via -r requirements-test.in +bitsandbytes==0.44.1 + # via -r requirements-test.in +black==24.10.0 + # via datamodel-code-generator +boto3==1.35.50 + # via tensorizer +botocore==1.35.50 + # via + # awscli + # boto3 + # s3transfer buildkite-test-collector==0.1.8 + # via -r requirements-test.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via soundfile +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # black + # nltk + # ray +colorama==0.4.6 + # via + # awscli + # sacrebleu + # tqdm-multiprocess +contourpy==1.3.0 + # via matplotlib +cupy-cuda12x==13.3.0 + # via ray +cycler==0.12.1 + # via matplotlib +datamodel-code-generator==0.26.2 + # via -r requirements-test.in +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +datasets==3.0.2 + # via + # evaluate + # lm-eval +decorator==5.1.1 + # via librosa +dill==0.3.8 + # via + # datasets + # evaluate + # lm-eval + # multiprocess +dnspython==2.7.0 + # via email-validator +docutils==0.16 + # via awscli +einops==0.8.0 + # via -r requirements-test.in +email-validator==2.2.0 + # via pydantic +evaluate==0.4.3 + # via lm-eval +fastrlock==0.8.2 + # via cupy-cuda12x +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers + # triton +fonttools==4.54.1 + # via matplotlib +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal + # ray +fsspec[http]==2024.9.0 + # via + # datasets + # evaluate + # huggingface-hub + # torch +genson==1.3.0 + # via datamodel-code-generator +h11==0.14.0 + # via httpcore +hiredis==3.0.0 + # via tensorizer +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via -r requirements-test.in +huggingface-hub==0.26.2 + # via + # accelerate + # datasets + # evaluate + # peft + # sentence-transformers + # timm + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # email-validator + # httpx + # requests + # yarl +inflect==5.6.2 + # via datamodel-code-generator +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via datamodel-code-generator +jinja2==3.1.4 + # via + # datamodel-code-generator + # torch +jmespath==1.0.1 + # via + # boto3 + # botocore +joblib==1.4.2 + # via + # librosa + # nltk + # scikit-learn +jsonlines==4.0.0 + # via lm-eval +jsonschema==4.23.0 + # via ray +jsonschema-specifications==2024.10.1 + # via jsonschema +kiwisolver==1.4.7 + # via matplotlib +lazy-loader==0.4 + # via librosa +libnacl==2.1.0 + # via tensorizer +librosa==0.10.2.post1 + # via -r requirements-test.in +llvmlite==0.43.0 + # via numba +lm-eval[api]==0.4.4 + # via -r requirements-test.in +lxml==5.3.0 + # via sacrebleu +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.9.2 + # via -r requirements-test.in +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +more-itertools==10.5.0 + # via lm-eval +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via + # librosa + # ray +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # datasets + # evaluate +mypy-extensions==1.0.0 + # via black +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via rouge-score +numba==0.60.0 + # via librosa +numexpr==2.10.1 + # via lm-eval +numpy==1.26.4 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # contourpy + # cupy-cuda12x + # datasets + # evaluate + # librosa + # matplotlib + # numba + # numexpr + # opencv-python + # pandas + # peft + # rouge-score + # sacrebleu + # scikit-learn + # scipy + # soxr + # tensorizer + # 
torchvision + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opencv-python==4.10.0.84 + # via -r requirements-test.in +packaging==24.1 + # via + # accelerate + # black + # datamodel-code-generator + # datasets + # evaluate + # huggingface-hub + # lazy-loader + # matplotlib + # peft + # pooch + # pytest + # pytest-rerunfailures + # ray + # transformers + # typepy +pandas==2.2.3 + # via + # datasets + # evaluate +pathspec==0.12.1 + # via black +pathvalidate==3.2.1 + # via pytablewriter +peft==0.13.2 + # via + # -r requirements-test.in + # lm-eval +pillow==11.0.0 + # via + # matplotlib + # sentence-transformers + # torchvision +platformdirs==4.3.6 + # via + # black + # pooch +pluggy==1.5.0 + # via pytest +pooch==1.8.2 + # via librosa +portalocker==2.10.1 + # via sacrebleu +propcache==0.2.0 + # via yarl +protobuf==5.28.3 + # via + # ray + # tensorizer +psutil==6.1.0 + # via + # accelerate + # peft + # tensorizer +py==1.11.0 + # via pytest-forked +pyarrow==18.0.0 + # via datasets +pyasn1==0.6.1 + # via rsa +pybind11==2.13.6 + # via lm-eval +pycparser==2.22 + # via cffi +pydantic[email]==2.9.2 + # via datamodel-code-generator +pydantic-core==2.23.4 + # via pydantic +pyparsing==3.2.0 + # via matplotlib +pytablewriter==1.2.0 + # via lm-eval +pytest==8.3.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # pytest-asyncio + # pytest-forked + # pytest-rerunfailures + # pytest-shard +pytest-asyncio==0.24.0 + # via -r requirements-test.in +pytest-forked==1.6.0 + # via -r requirements-test.in +pytest-rerunfailures==14.0 + # via -r requirements-test.in +pytest-shard==0.1.2 + # via -r requirements-test.in +python-dateutil==2.9.0.post0 + # via + # botocore + # matplotlib + # pandas + # typepy +pytz==2024.2 + # via + # pandas + # typepy +pyyaml==6.0.2 + # via + # accelerate + # awscli + # datamodel-code-generator + # datasets + # huggingface-hub + # peft + # ray + # timm + # transformers +ray[adag]==2.35.0 + # via -r requirements-test.in +redis==5.2.0 + # via tensorizer +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.9.11 + # via + # nltk + # sacrebleu + # tiktoken + # transformers +requests==2.32.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # datasets + # evaluate + # huggingface-hub + # lm-eval + # pooch + # ray + # tiktoken + # transformers +rouge-score==0.1.2 + # via lm-eval +rpds-py==0.20.0 + # via + # jsonschema + # referencing +rsa==4.7.2 + # via awscli +s3transfer==0.10.3 + # via + # awscli + # boto3 +sacrebleu==2.4.3 + # via lm-eval +safetensors==0.4.5 + # via + # accelerate + # peft + # timm + # transformers +scikit-learn==1.5.2 + # via + # librosa + # lm-eval + # sentence-transformers +scipy==1.13.1 + # via + # librosa + # scikit-learn + # sentence-transformers +sentence-transformers==3.2.1 + # via -r requirements-test.in +six==1.16.0 + # via + # python-dateutil + # rouge-score 
+sniffio==1.3.1 + # via + # anyio + # httpx +soundfile==0.12.1 + # via + # -r requirements-test.in + # librosa +soxr==0.5.0.post1 + # via librosa +sqlitedict==2.1.0 + # via lm-eval +sympy==1.13.1 + # via torch +tabledata==1.3.3 + # via pytablewriter +tabulate==0.9.0 + # via sacrebleu +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via lm-eval +tensorizer==2.9.0 + # via -r requirements-test.in +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.8.0 + # via lm-eval +timm==1.0.11 + # via -r requirements-test.in +tokenizers==0.20.1 + # via transformers +torch==2.5.0 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # lm-eval + # peft + # sentence-transformers + # tensorizer + # timm + # torchvision +torchvision==0.20.0 + # via timm +tqdm==4.66.6 + # via + # datasets + # evaluate + # huggingface-hub + # lm-eval + # nltk + # peft + # sentence-transformers + # tqdm-multiprocess + # transformers +tqdm-multiprocess==0.0.11 + # via lm-eval +transformers==4.45.2 + # via + # lm-eval + # peft + # sentence-transformers + # transformers-stream-generator +transformers-stream-generator==0.0.5 + # via -r requirements-test.in +triton==3.1.0 + # via torch +typepy[datetime]==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +typing-extensions==4.12.2 + # via + # huggingface-hub + # librosa + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==1.26.20 + # via + # botocore + # requests +word2number==1.1 + # via lm-eval +xxhash==3.5.0 + # via + # datasets + # evaluate +yarl==1.17.0 + # via aiohttp +zstandard==0.23.0 + # via lm-eval + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements-xpu.txt b/requirements-xpu.txt index ce83a178c618f..eb76a33dab5c2 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -13,4 +13,4 @@ torch == 2.3.1+cxx11.abi intel-extension-for-pytorch == 2.3.110+xpu oneccl_bind_pt == 2.3.100+xpu -triton-xpu == 3.0.0b2 +triton-xpu == 3.0.0b1 diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 3c2ca1bddd906..79647589d5204 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -11,7 +11,7 @@ import pytest from vllm import LLM -from vllm.utils import is_hip +from vllm.platforms import current_platform from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from ..models.utils import check_outputs_equal @@ -51,7 +51,7 @@ def test_models( enforce_eager: bool, ) -> None: - if backend == "FLASHINFER" and is_hip(): + if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 77c56d91d0a8b..6aa27b24b4a6e 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -13,11 +13,11 @@ @pytest.mark.parametrize( "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", [ - ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True), + ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True), ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", ["--quantization", "compressed-tensors" ], 1, 1, "FLASH_ATTN", "generate", True), - ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True), + ("ibm/PowerMoE-3b", [], 
1, 2, "FLASH_ATTN", "generate", True), # TODO: add multi-modality test for llava ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) ]) diff --git a/tests/compile/utils.py b/tests/compile/utils.py index c69343b51ae02..64fc08e80de3b 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -5,7 +5,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.compilation.levels import CompilationLevel -from vllm.utils import is_hip +from vllm.platforms import current_platform TEST_MODELS = [ ("facebook/opt-125m", {}), @@ -55,7 +55,7 @@ "quantization": "marlin" })) -if not is_hip() and is_quant_method_supported("awq"): +if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" })) diff --git a/tests/data/test_config.yaml b/tests/data/test_config.yaml index 42f4f6f7bb992..5090e8f357bb8 100644 --- a/tests/data/test_config.yaml +++ b/tests/data/test_config.yaml @@ -1,3 +1,5 @@ port: 12312 served_model_name: mymodel tensor_parallel_size: 2 +trust_remote_code: true +multi_step_stream_outputs: false diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 565dfa01346cc..675a980ab3f3f 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -4,6 +4,12 @@ def test_empty_prompt(): - llm = LLM(model="gpt2") + llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) + + +def test_out_of_vocab_token(): + llm = LLM(model="gpt2", enforce_eager=True) + with pytest.raises(ValueError, match='out of vocabulary'): + llm.generate({"prompt_token_ids": [999999]}) diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index f03bdb045f640..c81cfdbbe5cff 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -157,15 +157,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - # Added tokens should not appear in tokenized prompt - assert "vllm" not in completion.choices[0].text + with pytest.raises(openai.BadRequestError, match="out of vocabulary"): + # Added tokens should be rejected by the base model + await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 32000, 32001, 32002], + echo=True, + max_tokens=5, + temperature=0.0, + ) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 0a573a0066d32..58075f7023821 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -20,3 +20,18 @@ async def test_empty_prompt(): prompt="", max_tokens=5, temperature=0.0) + + +@pytest.mark.asyncio +async def test_out_of_vocab_token_ids(): + model_name = "gpt2" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile('.*out of vocabulary.*')): + await 
client.completions.create(model=model_name, + prompt=[999999], + max_tokens=5, + temperature=0.0) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 8311a5cb3c2d4..68804d6833c73 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -107,6 +107,42 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -162,6 +198,41 @@ async def test_single_chat_session_image_base64encoded( assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_base64encoded_beamsearch( + client: openai.AsyncOpenAI, model_name: str, image_url: str, + base64_encoded_image: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_tokens=10, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 8f6a54ff5979c..f2358940fc7b8 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -2,12 +2,13 @@ import torch -from vllm.utils import is_hip +from vllm.platforms import current_platform # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. 
ROCM_FP8_MAX = 224.0 -FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \ + else torch.float8_e4m3fn def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: @@ -24,8 +25,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor, qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ else torch.finfo(quant_dtype) - qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max - qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min + qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.max + qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.min qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) s_512 = as_float32_tensor(512.0) @@ -66,8 +69,10 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ -> Tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) - fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max - fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min + fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.max + fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.min fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 0e3d3c3a2e987..057a11746014c 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -8,7 +8,7 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, NewGELU, QuickGELU, SiluAndMul) -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .allclose_default import get_default_atol, get_default_rtol @@ -37,7 +37,7 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -85,7 +85,7 @@ def test_activation( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation[0]() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 52f1ecd176963..4ecd0fc1a21ad 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,11 +6,12 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol -if not is_hip(): +if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask @@ -23,8 +24,9 @@ NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} -DTYPES = [torch.half, torch.bfloat16, torch.float - ] if not is_hip() else [torch.half, torch.bfloat16] +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), 
(64, 8)] # Arbitrary values for testing @@ -114,7 +116,8 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize( - "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"]) + "version", + ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -141,7 +144,7 @@ def test_paged_attention( or (version == "rocm" and head_size not in (64, 128))): pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -317,8 +320,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. @@ -368,7 +371,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(is_hip(), +@pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") @torch.inference_mode() def test_multi_query_kv_attention( @@ -379,7 +382,7 @@ def test_multi_query_kv_attention( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. 
# As the xformers library is already tested with its own tests, we can use @@ -425,6 +428,6 @@ def test_multi_query_kv_attention( scale, dtype, ) - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index df3e770e260e0..3fe9ca0b0450f 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -25,7 +25,8 @@ def test_env(name: str, device: str, monkeypatch): False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.is_hip", return_value=True): + with patch("vllm.attention.selector.current_platform.is_rocm", + return_value=True): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "ROCM_FLASH" diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 0f0a2b24563fd..59917dd2c58ad 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -5,11 +5,10 @@ import pytest import torch +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( awq_marlin_quantize) @@ -81,7 +80,7 @@ def test_fused_marlin_moe_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids = fused_topk(a, score, topk, False) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -150,14 +149,14 @@ def test_single_marlin_moe_multiply_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe(a, - qweight, - scales, - score, - topk, - renormalize=False, - w_zeros=zp, - num_bits=num_bits) + marlin_output = torch.ops.vllm.single_marlin_moe(a, + qweight, + scales, + score, + topk, + renormalize=False, + w_zeros=zp, + num_bits=num_bits) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/test_awq_triton.py index e95e5bd948212..406a0c8dd8080 100644 --- a/tests/kernels/test_awq_triton.py +++ b/tests/kernels/test_awq_triton.py @@ -7,7 +7,7 @@ from vllm.model_executor.layers.quantization.awq_triton import ( AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton) -from vllm.utils import seed_everything +from vllm.platforms import current_platform device = "cuda" @@ -80,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): zeros_cols = qweight_cols zeros_dtype = torch.int32 - seed_everything(0) + current_platform.seed_everything(0) qweight = torch.randint(0, torch.iinfo(torch.int32).max, @@ -134,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size): qzeros_rows = scales_rows qzeros_cols = qweight_cols - seed_everything(0) + current_platform.seed_everything(0) input = torch.rand((input_rows, input_cols), dtype=input_dtype, diff --git 
a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index f3bd8f0524264..fb601852dd523 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -7,7 +7,8 @@ from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol @@ -172,7 +173,7 @@ def test_paged_attention( blocksparse_block_size: int, blocksparse_head_sliding_step: int, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -316,8 +317,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. @@ -383,7 +384,7 @@ def test_varlen_blocksparse_attention_prefill( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # As the xformers library is already tested with its own tests, we can use diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index b0e7097fdfbd4..5b8311a33c361 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -6,7 +6,7 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -56,7 +56,7 @@ def test_copy_blocks( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. @@ -132,7 +132,7 @@ def test_reshape_and_cache( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks @@ -224,7 +224,7 @@ def test_reshape_and_cache_flash( device: str, kv_cache_dtype: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. 
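Most of the kernel-test hunks in this part of the diff follow one migration: the free functions `seed_everything` and `is_hip` from `vllm.utils` become `current_platform.seed_everything(...)` and `current_platform.is_rocm()` from `vllm.platforms`. A minimal sketch of the shape these tests converge on — the "kernel" under test and the tolerance values here are placeholders, not taken from any specific file above:

```python
import pytest
import torch

from vllm.platforms import current_platform


@pytest.mark.parametrize("seed", [0])
@torch.inference_mode()
def test_kernel_numerics(seed: int) -> None:
    # Seed all RNGs through the platform object rather than the removed
    # vllm.utils.seed_everything helper.
    current_platform.seed_everything(seed)
    torch.set_default_device("cuda")

    x = torch.randn(16, 128, dtype=torch.float16)
    ref_output = torch.softmax(x.float(), dim=-1).to(x.dtype)  # reference path
    output = torch.softmax(x, dim=-1)                          # "kernel" path

    # ROCm kernels get relaxed tolerances, mirroring the hunks above.
    atol = 5e-3 if current_platform.is_rocm() else 1e-3
    rtol = 1e-3 if current_platform.is_rocm() else 1e-5
    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
```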
@@ -339,7 +339,7 @@ def test_swap_blocks( if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) src_device = device if direction[0] == "cuda" else 'cpu' dst_device = device if direction[1] == "cuda" else 'cpu' @@ -408,7 +408,7 @@ def test_fp8_e4m3_conversion( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) low = -224.0 high = 224.0 diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py index 277d7e4977d73..96bfe06d74ae5 100644 --- a/tests/kernels/test_causal_conv1d.py +++ b/tests/kernels/test_causal_conv1d.py @@ -9,7 +9,7 @@ from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def causal_conv1d_ref( @@ -70,7 +70,7 @@ def causal_conv1d_update_ref(x, bias: (dim,) cache_seqlens: (batch,), dtype int32. If not None, the conv_state is treated as a circular buffer. - The conv_state will be updated by copying x to the + The conv_state will be updated by copying x to the conv_state starting at the index @cache_seqlens % state_len before performing the convolution. @@ -161,7 +161,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) x = torch.randn(batch, dim, seqlen, device=device, dtype=itype).contiguous() @@ -223,7 +223,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch = 2 x = torch.randn(batch, dim, seqlen, device=device, dtype=itype) x_ref = x.clone() @@ -270,7 +270,7 @@ def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width, rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 3 padding = 5 if with_padding else 0 @@ -343,7 +343,7 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) seqlens = [] batch_size = 4 if seqlen < 10: diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 6b979d0558c46..bc99c5559d388 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -18,7 +18,7 @@ from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, global_force_attn_backend_context_manager) -from vllm.utils import is_hip +from vllm.platforms import current_platform # List of support backends for encoder/decoder models LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS] @@ -82,7 +82,7 @@ class TestResources(NamedTuple): will leverage attn_backend for the purpose of constructing backend-compatible attention metadata instances - + Attributes: * scale: 1/sqrt(d) scale factor for attn @@ -105,10 +105,10 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources: Build key components for performing encoder/decoder attention test. 
Note that - (1) The Attention instance constructed here, automatically selects + (1) The Attention instance constructed here, automatically selects an attention backend class based on platform info & a set of canned heuristics, so - (2) The attention backend instance constructed here is thus *not + (2) The attention backend instance constructed here is thus *not the same backend instance* used by attn, but rather it is intended to be a *different instance* of the *same backend class*; therefore, @@ -156,7 +156,7 @@ def _encoder_attn_setup( ''' Set up test vectors & data structures for encoder attention test. - A triplet of synthetic query/key/value tensors are constructed. + A triplet of synthetic query/key/value tensors are constructed. Given this is an encoder attention test, the key & value sequences will have the same length as the corresponding queries. @@ -169,14 +169,14 @@ def _encoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field - + Returns: - + * PhaseTestParameters data structure comprising (1) packed query/key/value tensors, (2) the ideal output of attention computed using a naive implementation, and (3) KVCache field set to None @@ -265,7 +265,7 @@ def _decoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -275,14 +275,14 @@ def _decoder_attn_setup( * qkv: Unpacked (batch_size x padded_seq_len x num_heads x head_size) query/key/value tensors * Prefill-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) + including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. - * Decode-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + * Decode-phase decoder self-attention PhaseTestParameters data structure, + including (1) packed (number_of_tokens x num_heads x head_size) + query/key/value tensors along with (2) ideal attention output + computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. * max_block_idx: max physical address in decoder self-attention block-table (intended to be used as the base address for the encoder/ @@ -436,12 +436,12 @@ def _enc_dec_cross_attn_setup_reuses_query( This function also constructs the cross-attention KV cache memory mapping (slot mapping and block table), ensuring that the block table starts at - block_base_addr. + block_base_addr. 
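The docstrings in this file keep referring to the containers the test is built around: `TestPoint` (the shape of one case), `TestResources` (the scale factor plus the constructed `Attention` wrapper and backend), and `PhaseTestParameters` (one phase's packed QKV, its ideal output from a naive reference, and the KV-cache/memory-mapping data, which is `None` for encoder attention). A rough sketch of that layout; field names beyond those quoted in the docstrings are guesses, not the real definitions:

```python
from typing import Any, NamedTuple, Optional

import torch


class TestPoint(NamedTuple):
    # The fields the setup helpers above say they rely on.
    batch_size: int
    num_heads: int
    head_size: int
    block_size: int
    max_q_seq_len: int


class PhaseTestParameters(NamedTuple):
    # Packed (num_tokens x num_heads x head_size) query/key/value tensors,
    # the ideal attention output from a naive implementation, and the
    # KV-cache/memory-mapping structures for this phase (None for the
    # encoder-attention phase, which needs no KV cache).
    packed_query: torch.Tensor
    packed_key: torch.Tensor
    packed_value: torch.Tensor
    ideal_output: torch.Tensor
    kv_cache: Optional[Any] = None
```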
Arguments: * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x - num_heads x head_size) decoder self-attention inputs; + num_heads x head_size) decoder self-attention inputs; this function relies on the query and q_seq_lens fields * encoder_test_params: PhaseTestParameters data structure which was @@ -452,7 +452,7 @@ def _enc_dec_cross_attn_setup_reuses_query( self-attention; all fields including KV cache required * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -460,16 +460,16 @@ def _enc_dec_cross_attn_setup_reuses_query( Returns: - * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed + * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data + structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. - * Decode-phase encoder/decoder cross-attention PhaseTestParameters data + * Decode-phase encoder/decoder cross-attention PhaseTestParameters data structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. ''' @@ -596,7 +596,7 @@ def _run_encoder_attention_test( ''' Run encoder attention. 
- attn.forward() is passed attn_type=AttentionType.ENCODER in order + attn.forward() is passed attn_type=AttentionType.ENCODER in order to configure the kernel invocation for encoder attention Requires attn_metadata.num_decode_tokens == 0 @@ -607,7 +607,7 @@ def _run_encoder_attention_test( * attn: Attention wrapper instance * encoder_test_params: encoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention @@ -646,7 +646,7 @@ def _run_decoder_self_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for decoder-self attention (contains KV cache memory-mapping) @@ -694,11 +694,11 @@ def _run_encoder_decoder_cross_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query field * cross_test_params: encoder/decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention @@ -726,7 +726,8 @@ def _run_encoder_decoder_cross_attention_test( attn_type=attn_type) -@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), + reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -755,7 +756,8 @@ def test_encoder_only( No KV cache is required for encoder-only attention. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). + AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). 
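As the docstring above spells out, encoder attention is exercised by calling `attn.forward()` with `attn_type=AttentionType.ENCODER`, no KV cache, and metadata whose `num_decode_tokens` is zero. A hedged sketch of what such a helper boils down to — the exact `forward` signature and the `AttentionType` import path are assumed here, not taken from this diff:

```python
import torch

from vllm.attention import AttentionType


def run_encoder_attention(attn, packed_query: torch.Tensor,
                          packed_key: torch.Tensor,
                          packed_value: torch.Tensor,
                          attn_metadata) -> torch.Tensor:
    # Encoder attention never reads or writes a KV cache, and the metadata
    # must describe prefill-style tokens only.
    assert attn_metadata.num_decode_tokens == 0
    return attn.forward(packed_query,
                        packed_key,
                        packed_value,
                        kv_cache=None,
                        attn_metadata=attn_metadata,
                        attn_type=AttentionType.ENCODER)
```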
This test globally forces an override of the usual backend auto-selection process, forcing the specific backend-under-test @@ -811,7 +813,8 @@ def test_encoder_only( assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) -@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), + reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -837,14 +840,14 @@ def test_e2e_enc_dec_attn( attributes for prefill-phase, and (2) an analogous attention metadata structure but for decode-phase * Test attention steps in the following order - + * Encoder attention * Prefill self-attention * Prefill cross-attention * Decode self-attention * Decode cross-attention - * Besides being reflective of realistic use-cases, this order would - exacerbate any accidental overlap in the self-/cross-attention + * Besides being reflective of realistic use-cases, this order would + exacerbate any accidental overlap in the self-/cross-attention block tables, which one hopes to avoid @@ -864,10 +867,11 @@ def test_e2e_enc_dec_attn( to be utilized. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). + AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). Note on metadata: there is a single attention metadata structure shared by - all prefill-phase attention operations (encoder, decoder, enc/dec cross), + all prefill-phase attention operations (encoder, decoder, enc/dec cross), and a single one shared by all decode-phase attention operations (decoder & enc/dec cross.) 
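The "globally forces an override" wording in these docstrings refers to the selector context manager imported at the top of this test file. A small sketch of that pattern, with `_Backend.XFORMERS` chosen only because it is the sole entry in `LIST_ENC_DEC_SUPPORTED_BACKENDS`:

```python
from vllm.attention.selector import (_Backend,
                                     global_force_attn_backend_context_manager)


def run_phases_with_forced_backend() -> None:
    # Every Attention instance built inside this block resolves to the
    # forced backend instead of whatever the auto-selection heuristics
    # would have picked for the current platform.
    with global_force_attn_backend_context_manager(_Backend.XFORMERS):
        ...  # construct TestResources and run the encoder/decoder phases
```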
This is intended to reflect the behavior of EncoderDecoderModelRunner, which constructs a single attention metadata diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 35c29c5bd1028..a20c73345218f 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -3,7 +3,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) @@ -91,7 +91,7 @@ def test_flash_attn_with_paged_kv( sliding_window: Optional[int], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -161,7 +161,7 @@ def test_varlen_with_paged_kv( num_blocks: int, ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index 80a388db6530e..a2c8f71665737 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -4,7 +4,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)] HEAD_SIZES = [128, 256] @@ -84,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv( soft_cap: Optional[float], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -170,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -268,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -381,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/test_fp8_quant.py index c18f5f468dc5a..ebaaae2321885 100644 --- a/tests/kernels/test_fp8_quant.py +++ b/tests/kernels/test_fp8_quant.py @@ -6,7 +6,7 @@ ref_dynamic_per_tensor_fp8_quant, ref_dynamic_per_token_quant) from tests.kernels.utils import opcheck -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, @@ -46,7 +46,7 @@ def opcheck_fp8_quant(output, def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 
# avoid nans @@ -76,7 +76,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") @@ -95,7 +95,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() @pytest.mark.parametrize("seed", SEEDS) def test_fp8_quant_large(seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings hidden_size = 1152 # Smallest hidden_size to reproduce the error diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py index 1513fc196153c..893af99ba4977 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/test_gguf.py @@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download import vllm._custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") @@ -75,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, @torch.inference_mode() def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") @@ -111,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype, @torch.inference_mode() def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 41e103e1d09f9..8db6a0d0d9fa4 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -4,7 +4,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.utils import opcheck from vllm._custom_ops import scaled_int8_quant -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, @@ -45,7 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): @torch.inference_mode() def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -68,7 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, @@ -112,7 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = 
torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -138,7 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float, azp: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 382079d472ee9..9dfa2cbe45e94 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -3,7 +3,7 @@ from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -31,7 +31,7 @@ def test_rms_norm( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index e92d401368a7b..bf7ff3b5c59b8 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -8,7 +8,7 @@ from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def selective_state_update_ref(state, @@ -235,7 +235,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, rtolw = max(rtolw, rtol) atolw = max(atolw, atol) # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 dim = 4 dstate = 8 @@ -358,7 +358,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): if torch.version.hip: atol *= 2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) x = torch.randn(batch_size, dim, device=device, dtype=itype) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index c0053071258ea..19c3fc1e1fe3a 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,19 +7,18 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -from vllm.utils import is_hip, seed_everything @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @@ -103,7 +102,7 @@ def test_mixtral_moe(dtype: torch.dtype): @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", 
[4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) -@pytest.mark.skipif(is_hip(), reason="Skip for rocm") +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_fused_marlin_moe( m: int, n: int, @@ -115,7 +114,7 @@ def test_fused_marlin_moe( num_bits: int, is_k_full: bool, ): - seed_everything(7) + current_platform.seed_everything(7) # Filter act_order if act_order: @@ -192,7 +191,7 @@ def test_fused_marlin_moe( topk, renormalize=False, ) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -256,7 +255,7 @@ def test_fused_marlin_moe( @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) -@pytest.mark.skipif(is_hip(), reason="Skip for rocm") +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_single_marlin_moe_multiply( m: int, n: int, @@ -308,7 +307,7 @@ def test_single_marlin_moe_multiply( sort_indices = stack_and_dev(sort_indices_l) score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe( + marlin_output = torch.ops.vllm.single_marlin_moe( a, qweight, scales, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 94da00915d40e..b408559cc0b07 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,7 +5,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .allclose_default import get_default_atol, get_default_rtol @@ -48,7 +48,7 @@ def test_rotary_embedding( if rotary_dim is None: rotary_dim = head_size - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -100,7 +100,7 @@ def test_batched_rotary_embedding( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -160,7 +160,7 @@ def test_batched_rotary_embedding_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3181d92562399..a8a187ebaede4 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -9,7 +9,8 @@ from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, seed_everything +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -39,7 +40,7 @@ def test_contexted_kv_attention( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process @@ -234,7 +235,7 @@ def test_contexted_kv_attention_alibi( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process diff --git 
a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index f7c1d4f041c12..15ec66b0f5502 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -4,7 +4,7 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hip +from vllm.platforms import current_platform MODEL_PATH = "google/gemma-7b" @@ -31,7 +31,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm") +@pytest.mark.xfail(current_platform.is_rocm(), + reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index db877219a285c..eb882faf3974a 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .utils import DummyLoRAManager @@ -923,7 +923,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, seq_len) -> None: dtype = torch.float16 seed = 0 - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py index 41c37a4813c68..e756544d96e98 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_sizes.py @@ -1,5 +1,5 @@ """ -This script is mainly used to tests various hidden_sizes. We have collected the +This script is mainly used to tests various hidden_sizes. We have collected the hidden_sizes included in the LoRA models currently supported by vLLM. It tests whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. 
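Further up, the MoE kernel tests stop importing `fused_marlin_moe`/`single_marlin_moe` directly and instead call them as `torch.ops.vllm.fused_marlin_moe(...)`, with the module import (`import vllm.model_executor.layers.fused_moe  # noqa`) apparently providing the op registration as a side effect. The snippet below is not vLLM's registration code, just a generic sketch of how a Python function ends up reachable under a `torch.ops` namespace; the `demo::scaled_add` op and its signature are made up:

```python
import torch
from torch.library import custom_op


@custom_op("demo::scaled_add", mutates_args=())
def scaled_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Eager implementation; torch.compile and other subsystems can now
    # treat this as an opaque custom operator.
    return x + alpha * y


@scaled_add.register_fake
def _(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Shape/dtype propagation used when tracing without real data.
    return torch.empty_like(x)


# Importing the module that runs these decorators is what makes the op
# callable through the torch.ops namespace, analogous to the noqa import
# added in the MoE tests above.
out = torch.ops.demo.scaled_add(torch.ones(4), torch.ones(4), 2.0)
assert torch.allclose(out, torch.full((4,), 3.0))
```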
@@ -15,8 +15,8 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.platforms import current_platform from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -146,7 +146,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -239,7 +239,7 @@ def test_punica_bgmv( from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -327,7 +327,7 @@ def test_punica_expand_nslices( from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index 185da6399a06a..dc0edeb10ef46 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -1,6 +1,6 @@ """ -This script is mainly used to test whether trtion kernels can run normally -under different conditions, including various batches, numbers of LoRA , and +This script is mainly used to test whether trtion kernels can run normally +under different conditions, including various batches, numbers of LoRA , and maximum ranks. """ from unittest.mock import patch @@ -14,8 +14,8 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.platforms import current_platform from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -61,7 +61,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -154,7 +154,7 @@ def test_punica_bgmv( from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -242,7 +242,7 @@ def test_punica_expand_nslices( from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index d004c65929418..5432fa4ad0d3a 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -8,7 +8,7 @@ import vllm from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from vllm.utils import is_hip +from vllm.platforms import current_platform @dataclass @@ -19,7 +19,7 @@ class ModelWithQuantization: MODELS: List[ModelWithQuantization] #AWQ quantization is currently not supported in ROCm. 
-if is_hip(): +if current_platform.is_rocm(): MODELS = [ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py index 75625b35209ce..fcfc159e4f5a0 100644 --- a/tests/models/decoder_only/language/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -8,7 +8,7 @@ from vllm.platforms import current_platform -from ...utils import check_outputs_equal +from ...utils import check_logprobs_close, check_outputs_equal MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -43,18 +43,40 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if model == "openbmb/MiniCPM3-4B": + # the output becomes slightly different when upgrading to + # pytorch 2.5 . Changing to logprobs checks instead of exact + # output checks. + NUM_LOG_PROBS = 8 + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, NUM_LOG_PROBS) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py index fd7af4a8b0b29..38c4a95de16f4 100644 --- a/tests/models/decoder_only/vision_language/test_broadcast.py +++ b/tests/models/decoder_only/vision_language/test_broadcast.py @@ -24,7 +24,7 @@ def test_models(hf_runner, vllm_runner, image_assets, elif model.startswith("llava-hf/llava-v1.6"): from .test_llava_next import models, run_test # type: ignore[no-redef] elif model.startswith("facebook/chameleon"): - if transformers.__version__.startswith("4.46.0"): + if transformers.__version__.startswith("4.46"): pytest.skip("Model broken in HF, " "see huggingface/transformers#34379") from .test_chameleon import models, run_test # type: ignore[no-redef] diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 367f25f446279..1616fd299b9aa 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Type, overload +from typing import List, Optional, Tuple, Type import pytest from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, @@ -9,9 +9,8 @@ from vllm.sequence import 
SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _VideoAssets) -from ....utils import large_gpu_test +from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, + PromptVideoInput, VllmRunner) from ...utils import check_logprobs_close # Video test @@ -20,7 +19,7 @@ "<|im_start|>user\n
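The `test_big_models.py` hunk above loosens the MiniCPM3 check after the PyTorch 2.5 upgrade: instead of requiring token-for-token identical greedy outputs, it asks that each generated token appear among the other runner's top log-probabilities at the same step. A toy illustration of that idea — this is not the real `check_logprobs_close` implementation, and the example data is invented:

```python
from typing import Dict, List


def tokens_within_top_logprobs(
        tokens: List[int],
        other_top_logprobs: List[Dict[int, float]]) -> bool:
    """True if every token picked by one runner is at least a top-k
    candidate of the other runner at the same decoding step."""
    return all(tok in candidates
               for tok, candidates in zip(tokens, other_top_logprobs))


# Step 2 differs from the other runner's argmax (11), but 13 is still in
# its top candidates, so the relaxed check passes where exact-match fails.
vllm_tokens = [42, 7, 13]
hf_top_logprobs = [{42: -0.01, 5: -4.2},
                   {7: -0.10, 9: -2.5},
                   {11: -0.30, 13: -0.9}]
assert tokens_within_top_logprobs(vllm_tokens, hf_top_logprobs)
```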