diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c4fc43dc0abb8..8c98aa36ac0ff 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -310,13 +310,22 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Decoder-only Language Models Test # 1h36min +- label: Decoder-only Language Models Test (Standard) # 35min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language + - pytest -v -s models/decoder_only/language/test_models.py + - pytest -v -s models/decoder_only/language/test_big_models.py + +- label: Decoder-only Language Models Test (Extended) # 1h20min + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + commands: + - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py - label: Decoder-only Multi-Modal Models Test # 1h31min #mirror_hardwares: [amd] diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 2a0e3239f58da..b80749aaa8fec 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -34,4 +34,5 @@ jobs: - name: "Run actionlint" run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" tools/actionlint.sh -color diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 2e7c7f7f087af..c9d6d4259df99 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 064af291009fa..68d60d7365ed1 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -38,4 +38,4 @@ jobs: ) find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror \ No newline at end of file + | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000000000..f048fce528941 --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json new file mode 100644 index 0000000000000..f6d4479ee1996 --- /dev/null +++ b/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + 
"owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 22e3564779ad9..5f1e5f8eeaf7d 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -32,4 +32,5 @@ jobs: pip install types-setuptools - name: Mypy run: | - tools/mypy.sh + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 96549b3f99181..f959a1cacf866 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Extract branch info shell: bash @@ -30,7 +30,7 @@ jobs: - name: Create Release id: create_release - uses: "actions/github-script@v7" + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: RELEASE_TAG: ${{ env.release_tag }} with: @@ -54,10 +54,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Setup ccache - uses: hendrikmuhs/ccache-action@v1.2 + uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 with: create-symlink: true key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} @@ -92,7 +92,7 @@ jobs: echo "asset_name=${asset_name}" >> "$GITHUB_ENV" - name: Upload Release Asset - uses: actions/upload-release-asset@v1 + uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index d1791c3bc865a..df62539c0b3d9 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | github.rest.issues.createComment({ diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index be73fb85ed1fa..9cc8a9e914474 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: 
actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -28,7 +28,8 @@ jobs: pip install -r requirements-lint.txt - name: Analysing the code with ruff run: | - ruff check . + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . - name: Spelling check with codespell run: | codespell --toml pyproject.toml diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index eb728ae04dfc1..9f06b35c19e32 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -16,9 +16,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index c1a396c81f666..0d205014b15bf 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -53,6 +53,8 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + @dataclass class BenchmarkMetrics: @@ -60,6 +62,7 @@ class BenchmarkMetrics: total_input: int total_output: int request_throughput: float + request_goodput: float output_throughput: float total_token_throughput: float mean_ttft_ms: float @@ -202,6 +205,7 @@ def sample_hf_requests( dataset_split: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, + random_seed: int, fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: dataset = load_dataset(dataset_path, @@ -210,8 +214,8 @@ def sample_hf_requests( streaming=True) assert "conversations" in dataset.features, ( "HF Dataset must have 'conversations' column.") - filtered_dataset = dataset.shuffle().filter( - lambda x: len(x["conversations"]) >= 2) + filter_func = lambda x: len(x["conversations"]) >= 2 + filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) sampled_requests: List[Tuple[str, int, int, Dict[str, Collection[str]]]] = [] for data in filtered_dataset: @@ -315,12 +319,15 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], + gootput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 completed = 0 + good_completed = 0 itls: List[float] = [] tpots: List[float] = [] + all_tpots: List[float] = [] ttfts: List[float] = [] e2els: List[float] = [] for i in range(len(outputs)): @@ -334,9 +341,13 @@ def calculate_metrics( add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] + tpot = 0 if output_len > 1: - tpots.append( - (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - + 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) itls += outputs[i].itl ttfts.append(outputs[i].ttft) e2els.append(outputs[i].latency) @@ -344,6 +355,28 @@ def calculate_metrics( else: actual_output_lens.append(0) + if gootput_config_dict: + valid_metrics = [] + 
slo_values = [] + + if "ttft" in gootput_config_dict: + valid_metrics.append(ttfts) + slo_values.append(gootput_config_dict["ttft"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "tpot" in gootput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append(gootput_config_dict["tpot"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "e2el" in gootput_config_dict: + valid_metrics.append(e2els) + slo_values.append(gootput_config_dict["e2el"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + if completed == 0: warnings.warn( "All requests failed. This is likely due to a misconfiguration " @@ -354,6 +387,7 @@ def calculate_metrics( total_input=total_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * @@ -397,6 +431,8 @@ async def benchmark( selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, + gootput_config_dict: Dict[str, float], + max_concurrency: Optional[int], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -445,9 +481,25 @@ async def benchmark( print("Profiler started") print(f"Traffic request rate: {request_rate}") + print(f"Maximum request concurrency: {max_concurrency}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + benchmark_start_time = time.perf_counter() tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): @@ -463,8 +515,8 @@ async def benchmark( ignore_eos=ignore_eos) tasks.append( asyncio.create_task( - request_func(request_func_input=request_func_input, - pbar=pbar))) + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -494,6 +546,7 @@ async def benchmark( tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, + gootput_config_dict=gootput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -505,6 +558,9 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) + if gootput_config_dict: + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", + metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", @@ -516,6 +572,8 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, 
+ "request_goodput:": + metrics.request_goodput if gootput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -569,6 +627,41 @@ def process_one_metric( return result +def check_goodput_args(args): + # Check and parse goodput arguments + gootput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + gootput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in gootput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. ") + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative.") + return gootput_config_dict + + +def parse_goodput(slo_pairs): + gootput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + gootput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + "Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds.") from err + return gootput_config_dict + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -646,6 +739,7 @@ def main(args: argparse.Namespace): dataset_split=args.hf_split, num_requests=args.num_prompts, tokenizer=tokenizer, + random_seed=args.seed, fixed_output_len=args.hf_output_len, ) @@ -662,6 +756,8 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") + gootput_config_dict = check_goodput_args(args) + benchmark_result = asyncio.run( benchmark( backend=backend, @@ -680,6 +776,8 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, + gootput_config_dict=gootput_config_dict, + max_concurrency=args.max_concurrency, )) # Save config and results to json @@ -709,13 +807,16 @@ def main(args: argparse.Namespace): # Traffic result_json["request_rate"] = ( args.request_rate if args.request_rate < float("inf") else "inf") + result_json["max_concurrency"] = args.max_concurrency # Merge with benchmark result result_json = {**result_json, **benchmark_result} # Save to file base_model_id = model_id.split("/")[-1] - file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + max_concurrency_str = (f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None else "") + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa if args.result_filename: file_name = args.result_filename if args.result_dir: @@ -766,6 +867,19 @@ def main(args: argparse.Namespace): default=None, help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. 
While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + parser.add_argument( "--model", type=str, @@ -879,6 +993,17 @@ def main(args: argparse.Namespace): "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help="Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is in " + "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + "separated by spaces. Allowed request level metric names are " + "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve") # group for dataset specific arguments sonnet_group = parser.add_argument_group("sonnet dataset options") diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index b5fa83b437ac4..62ab8c067f5d0 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -87,6 +87,11 @@ Text Generation - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - ✅︎ + * - :code:`FalconMambaForCausalLM` + - FalconMamba + - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. + - ✅︎ + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. @@ -294,6 +299,10 @@ Text Embedding - - ✅︎ +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. + Reward Modeling --------------- @@ -433,7 +442,7 @@ Text Generation * - :code:`PixtralForConditionalGeneration` - Pixtral - T + I\ :sup:`+` - - :code:`mistralai/Pixtral-12B-2409` + - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. - - ✅︎ * - :code:`QWenLMHeadModel` @@ -482,6 +491,10 @@ Multimodal Embedding - 🚧 - ✅︎ +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. + ---- If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 7dd42ec1bb9c9..a7b55d1c0c1ff 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc .. code-block:: bash - vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ - --trust-remote-code --limit-mm-per-prompt image=2 + vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 .. 
important:: Since OpenAI Vision API is based on `Chat Completions `_ API, diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 9132e12a36ba5..cc8e539a8a6d3 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -157,7 +157,7 @@ vLLM will use guided decoding to ensure the response matches the tool parameter To enable this feature, you should set the following flags: * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use - currently either `hermes` or `mistral` or `llama3_json` or `internlm`. Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. * `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. * `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages @@ -168,7 +168,7 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! -#### Hermes Models +#### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` @@ -180,7 +180,7 @@ step in their creation_. Flags: `--tool-call-parser hermes` -#### Mistral Models +#### Mistral Models (`mistral`) Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. @@ -199,7 +199,7 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` -#### Llama Models +#### Llama Models (`llama3_json`) Supported models: * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` @@ -219,16 +219,24 @@ it works better with vLLM. Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` -#### Internlm Models +#### InternLM Models (`internlm`) Supported models: * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well Known issues: -* Although this implementation also supports Internlm2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. +* Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` +#### Jamba Models (`jamba`) +AI21's Jamba-1.5 models are supported. 
+* `ai21labs/AI21-Jamba-1.5-Mini` +* `ai21labs/AI21-Jamba-1.5-Large` + + +Flags: `--tool-call-parser jamba` + ### How to write a tool parser plugin diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4c88dcc2f087b..06b424abd50b5 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -277,6 +277,22 @@ def run_qwen2_vl(question: str, modality: str): return llm, prompt, stop_token_ids +# Pixtral HF-format +def run_pixtral_hf(question: str, modality: str): + assert modality == "image" + + model_name = "mistral-community/pixtral-12b" + + llm = LLM( + model=model_name, + max_model_len=8192, + ) + + prompt = f"[INST]{question}\n[IMG][/INST]" + stop_token_ids = None + return llm, prompt, stop_token_ids + + # LLama 3.2 def run_mllama(question: str, modality: str): assert modality == "image" @@ -347,6 +363,7 @@ def run_glm4v(question: str, modality: str): "NVLM_D": run_nvlm_d, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, + "pixtral_hf": run_pixtral_hf, "mllama": run_mllama, "molmo": run_molmo, "glm4v": run_glm4v, diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index 8e62199e1db7b..cfedd145a015d 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -7,6 +7,7 @@ # Create an LLM. llm = LLM( model="TIGER-Lab/VLM2Vec-Full", + task="embedding", trust_remote_code=True, max_model_len=4096, max_num_seqs=2, diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 3b3e0ae64a037..67b755a155966 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,4 +1,5 @@ from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory # NOTE: This is just a running example. For benchmarking purpose, # please see benchmarks/benchmark_prefix_caching.py @@ -28,12 +29,9 @@ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0) -# Create an LLM. +# Create an LLM without prefix caching as a baseline. regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -prefix_cached_llm = LLM(model="facebook/opt-125m", - enable_prefix_caching=True, - gpu_memory_utilization=0.4) print("Results without `enable_prefix_caching`") # Generate texts from the prompts. The output is a list of RequestOutput objects @@ -50,6 +48,15 @@ print("-" * 80) +# Destroy the LLM object and free up the GPU memory. +del regular_llm +cleanup_dist_env_and_memory() + +# Create an LLM with prefix caching enabled. +prefix_cached_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4) + # Warmup so that the shared prompt's KV cache is computed. 
prefix_cached_llm.generate(generating_prompts[0], sampling_params) diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_api_client_for_multimodal.py index 704236be72d03..beb83e494ed0b 100644 --- a/examples/openai_api_client_for_multimodal.py +++ b/examples/openai_api_client_for_multimodal.py @@ -7,8 +7,8 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja (multi-image inference with Phi-3.5-vision-instruct) -vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ - --trust-remote-code --limit-mm-per-prompt image=2 +vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 (audio inference with Ultravox) vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096 diff --git a/requirements-common.txt b/requirements-common.txt index ca09f9d35909e..d72cc44762720 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.6.0 # required for compressed-tensors +compressed-tensors == 0.7.1 # required for compressed-tensors diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1903a7582dc89..8a04693ba676d 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -12,11 +12,11 @@ from vllm import SamplingParams from vllm.config import ParallelConfig +from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine from vllm.outputs import RequestOutput as RealRequestOutput from vllm.sampling_params import RequestOutputKind -from ..conftest import cleanup from ..utils import wait_for_gpu_memory_to_clear @@ -157,7 +157,7 @@ async def async_engine(): engine.shutdown_background_loop() del engine await asyncio.sleep(0.1) - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture() diff --git a/tests/conftest.py b/tests/conftest.py index 5df7da9ee64e2..4c9180415da32 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,3 @@ -import contextlib -import gc import json import os import sys @@ -25,10 +23,9 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import TokenizerPoolConfig +from vllm.config import TaskOption, TokenizerPoolConfig from vllm.connections import global_http_connection -from vllm.distributed import (destroy_distributed_environment, - destroy_model_parallel, +from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, @@ -140,17 +137,7 @@ def dist_init(): ) initialize_model_parallel(1, 1) yield - cleanup() - - -def cleanup(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - if not is_cpu(): - torch.cuda.empty_cache() + cleanup_dist_env_and_memory() @pytest.fixture() @@ -167,7 +154,7 @@ def should_do_global_cleanup_after_test(request) -> bool: 
def cleanup_fixture(should_do_global_cleanup_after_test: bool): yield if should_do_global_cleanup_after_test: - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(autouse=True) @@ -606,7 +593,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): del self.model - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="session") @@ -619,6 +606,7 @@ class VllmRunner: def __init__( self, model_name: str, + task: TaskOption = "auto", tokenizer_name: Optional[str] = None, # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. @@ -634,6 +622,7 @@ def __init__( ) -> None: self.model = LLM( model=model_name, + task=task, tokenizer=tokenizer_name, trust_remote_code=True, dtype=dtype, @@ -859,7 +848,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): del self.model - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="session") diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index e870597b7a011..70577ec052a2c 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -3,10 +3,9 @@ import pytest from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.utils import set_random_seed -from ....conftest import cleanup - @pytest.fixture def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, @@ -37,7 +36,7 @@ def generator_inner(): yield llm del llm - cleanup() + cleanup_dist_env_and_memory() for llm in generator_inner(): yield llm diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index f97caa06ff02d..308dad1850c9a 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -33,7 +33,8 @@ def test_simple(): num_seq_group = 4 max_model_len = 16 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig(max_num_batched_tokens, + scheduler_config = SchedulerConfig("generate", + max_num_batched_tokens, num_seq_group, max_model_len, enable_chunked_prefill=True) @@ -78,6 +79,7 @@ def test_chunk(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -126,6 +128,7 @@ def test_complex(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -196,6 +199,7 @@ def test_maximal_decoding(): max_model_len = 8 max_num_batched_tokens = 2 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -289,6 +293,7 @@ def test_prompt_limit(): max_model_len = 64 max_num_batched_tokens = 32 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -321,7 +326,8 @@ def test_prompt_limit_exceed(): max_seqs = 64 max_model_len = 32 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig(max_num_batched_tokens, + scheduler_config = SchedulerConfig("generate", + max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True) @@ -348,6 +354,7 @@ def test_swap(): max_model_len = 200 max_num_batched_tokens = 30 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap(): max_model_len = 200 max_num_batched_tokens = 30 scheduler_config = SchedulerConfig( + "generate", 
max_num_batched_tokens, max_seqs, max_model_len, @@ -498,6 +506,7 @@ def test_chunked_prefill_preempt(): max_model_len = 200 max_num_batched_tokens = 30 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -617,6 +627,7 @@ def test_perfix_caching(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index defa6c1bdaf78..00b6349b9f8c5 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -20,9 +20,10 @@ def test_scheduler_add_seq_group(): block_size = 4 scheduler_config = SchedulerConfig( - 100, - 64, - 1, + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=1, ) cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") cache_config.num_cpu_blocks = 4 @@ -42,9 +43,10 @@ def test_scheduler_add_seq_group(): def test_scheduler_abort_seq_group(): block_size = 4 scheduler_config = SchedulerConfig( - 100, - 64, - 1, + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=1, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 4 @@ -70,9 +72,10 @@ def test_scheduler_schedule_simple(): num_seq_group = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - num_seq_group, - max_model_len, + "generate", + max_num_batched_tokens=64, + max_num_seqs=num_seq_group, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 @@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized(): max_model_len = 30 max_batched_num_tokens = 30 scheduler_config = SchedulerConfig( - max_batched_num_tokens, - 2, - max_model_len, + "generate", + max_num_batched_tokens=max_batched_num_tokens, + max_num_seqs=2, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 @@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort(): block_size = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - 2, - max_model_len, + "generate", + max_num_batched_tokens=64, + max_num_seqs=2, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 2 @@ -204,9 +209,10 @@ def test_scheduler_max_seqs(): max_seq_group = 2 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - max_seq_group, - max_model_len, + "generate", + max_num_batched_tokens=64, + max_num_seqs=max_seq_group, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 @@ -248,9 +254,10 @@ def test_scheduler_max_seqs(): def test_scheduler_delay_factor(): block_size = 4 scheduler_config = SchedulerConfig( - 100, - 64, - 16, + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=16, delay_factor=0.5, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") @@ -350,9 +357,10 @@ def initialize_scheduler( ): block_size = block_size scheduler_config = SchedulerConfig( - max_token_budget, - max_num_seqs, - max_model_len, + "generate", + max_num_batched_tokens=max_token_budget, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, ) cache_config = 
CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 50c047f30b80d..7cd0416d321ef 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder(): block_size = 4 num_seq_group = 4 max_model_len = 16 - scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len) + scheduler_config = SchedulerConfig( + task="generate", + max_num_batched_tokens=64, + max_num_seqs=num_seq_group, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 88d0a4ba7f57b..fee201850f203 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -11,6 +11,7 @@ import pytest +from vllm.config import TaskOption from vllm.logger import init_logger from ..utils import compare_two_settings, fork_new_process_for_each_test @@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple): class PPTestSettings: parallel_setups: List[ParallelSetup] distributed_backends: List[str] + task: TaskOption trust_remote_code: bool tokenizer_mode: Optional[str] @@ -39,6 +41,7 @@ def detailed( *, tp_base: int = 1, pp_base: int = 2, + task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, ): @@ -66,6 +69,7 @@ def detailed( chunked_prefill=False), ], distributed_backends=["mp", "ray"], + task=task, trust_remote_code=trust_remote_code, tokenizer_mode=tokenizer_mode, ) @@ -75,6 +79,7 @@ def fast( *, tp_base: int = 1, pp_base: int = 2, + task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, ): @@ -86,6 +91,7 @@ def fast( chunked_prefill=False), ], distributed_backends=["mp"], + task=task, trust_remote_code=trust_remote_code, tokenizer_mode=tokenizer_mode, ) @@ -94,7 +100,7 @@ def iter_params(self, model_name: str): for parallel_setup in self.parallel_setups: for distributed_backend in self.distributed_backends: yield (model_name, parallel_setup, distributed_backend, - self.trust_remote_code, self.tokenizer_mode) + self.task, self.trust_remote_code, self.tokenizer_mode) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU @@ -213,6 +219,7 @@ def _compare_tp( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, + task: TaskOption, trust_remote_code: bool, tokenizer_mode: Optional[str], num_gpus_available: int, @@ -240,6 +247,8 @@ def _compare_tp( common_args.append("--enable-chunked-prefill") if eager_mode: common_args.append("--enforce-eager") + if task != "auto": + common_args.extend(["--task", task]) if trust_remote_code: common_args.append("--trust-remote-code") if tokenizer_mode: @@ -297,7 +306,7 @@ def _compare_tp( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", + ("model_name", "parallel_setup", "distributed_backend", "task", "trust_remote_code", "tokenizer_mode"), [ params for model_name, settings in GENERATION_MODEL_SETTINGS.items() @@ -310,6 +319,7 @@ def test_tp_language_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, + task: TaskOption, trust_remote_code: 
bool, tokenizer_mode: Optional[str], num_gpus_available, @@ -317,6 +327,7 @@ def test_tp_language_generation( _compare_tp(model_name, parallel_setup, distributed_backend, + task, trust_remote_code, tokenizer_mode, num_gpus_available, @@ -324,7 +335,7 @@ def test_tp_language_generation( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", + ("model_name", "parallel_setup", "distributed_backend", "task", "trust_remote_code", "tokenizer_mode"), [ params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items() @@ -337,6 +348,7 @@ def test_tp_language_embedding( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, + task: TaskOption, trust_remote_code: bool, tokenizer_mode: Optional[str], num_gpus_available, @@ -344,6 +356,7 @@ def test_tp_language_embedding( _compare_tp(model_name, parallel_setup, distributed_backend, + task, trust_remote_code, tokenizer_mode, num_gpus_available, @@ -351,7 +364,7 @@ def test_tp_language_embedding( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", + ("model_name", "parallel_setup", "distributed_backend", "task", "trust_remote_code", "tokenizer_mode"), [ params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items() @@ -364,6 +377,7 @@ def test_tp_multimodal_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, + task: TaskOption, trust_remote_code: bool, tokenizer_mode: Optional[str], num_gpus_available, @@ -371,6 +385,7 @@ def test_tp_multimodal_generation( _compare_tp(model_name, parallel_setup, distributed_backend, + task, trust_remote_code, tokenizer_mode, num_gpus_available, diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py new file mode 100644 index 0000000000000..b57348a4d9a58 --- /dev/null +++ b/tests/entrypoints/llm/test_chat.py @@ -0,0 +1,92 @@ +from typing import List + +import pytest + +from vllm import LLM + +from ..openai.test_vision import TEST_IMAGE_URLS + + +def test_chat(): + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + + prompt1 = "Explain the concept of entropy." + messages = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt1 + }, + ] + outputs = llm.chat(messages) + assert len(outputs) == 1 + + +def test_multi_chat(): + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + + prompt1 = "Explain the concept of entropy." + prompt2 = "Explain what among us is." + + conversation1 = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt1 + }, + ] + + conversation2 = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt2 + }, + ] + + messages = [conversation1, conversation2] + + outputs = llm.chat(messages) + assert len(outputs) == 2 + + +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +def test_chat_multi_image(image_urls: List[str]): + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + dtype="bfloat16", + max_model_len=4096, + max_num_seqs=5, + enforce_eager=True, + trust_remote_code=True, + limit_mm_per_prompt={"image": 2}, + ) + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "image_url", + "image_url": { + "url": image_url + } + } for image_url in image_urls), + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + outputs = llm.chat(messages) + assert len(outputs) >= 0 diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 1885f2e168d80..4c9f796e5ed71 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -4,8 +4,7 @@ import pytest from vllm import LLM, EmbeddingRequestOutput, PoolingParams - -from ...conftest import cleanup +from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -41,7 +40,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() def assert_outputs_equal(o1: List[EmbeddingRequestOutput], diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 6543c4bb1b58e..7d2b377752725 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -4,9 +4,7 @@ import pytest from vllm import LLM, RequestOutput, SamplingParams - -from ...conftest import cleanup -from ..openai.test_vision import TEST_IMAGE_URLS +from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "facebook/opt-125m" @@ -40,7 +38,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): @@ -104,90 +102,3 @@ def test_multiple_sampling_params(llm: LLM): # sampling_params is None, default params should be applied outputs = llm.generate(PROMPTS, sampling_params=None) assert len(PROMPTS) == len(outputs) - - -def test_chat(): - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - - prompt1 = "Explain the concept of entropy." - messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, - ] - outputs = llm.chat(messages) - assert len(outputs) == 1 - - -def test_multi_chat(): - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - - prompt1 = "Explain the concept of entropy." - prompt2 = "Explain what among us is." - - conversation1 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, - ] - - conversation2 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt2 - }, - ] - - messages = [conversation1, conversation2] - - outputs = llm.chat(messages) - assert len(outputs) == 2 - - -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) -def test_chat_multi_image(image_urls: List[str]): - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - dtype="bfloat16", - max_model_len=4096, - max_num_seqs=5, - enforce_eager=True, - trust_remote_code=True, - limit_mm_per_prompt={"image": 2}, - ) - - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": "What's in this image?" 
- }, - ], - }] - outputs = llm.chat(messages) - assert len(outputs) >= 0 diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 9f5727ecd0406..eb2113692e7b4 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -5,10 +5,9 @@ from huggingface_hub import snapshot_download from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from ...conftest import cleanup - MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" PROMPTS = [ @@ -39,7 +38,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="module") diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 2841dfc6bd9c2..67c79415f322a 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -5,12 +5,11 @@ import jsonschema import pytest +from vllm.distributed import cleanup_dist_env_and_memory from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -from ...conftest import cleanup - MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -23,7 +22,7 @@ def llm(): with llm.deprecate_legacy_api(): yield weakref.proxy(llm) del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.mark.skip_global_cleanup diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py new file mode 100644 index 0000000000000..c9a4ad44fea30 --- /dev/null +++ b/tests/entrypoints/llm/test_init.py @@ -0,0 +1,22 @@ +import pytest + +from vllm import LLM + +from ...utils import error_on_warning + +MODEL_NAME = "facebook/opt-125m" + + +def test_pos_args_deprecated(): + with error_on_warning(DeprecationWarning): + LLM(model=MODEL_NAME, tokenizer=MODEL_NAME) + + with error_on_warning(DeprecationWarning): + LLM(MODEL_NAME, tokenizer=MODEL_NAME) + + with pytest.warns(DeprecationWarning, match="'tokenizer'"): + LLM(MODEL_NAME, MODEL_NAME) + + with pytest.warns(DeprecationWarning, + match="'tokenizer', 'tokenizer_mode'"): + LLM(MODEL_NAME, MODEL_NAME, "auto") diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 39480531f5866..cbfb0cc32c1ce 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,6 +1,7 @@ import sys from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory def test_lazy_outlines(sample_regex): @@ -14,6 +15,7 @@ def test_lazy_outlines(sample_regex): ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + # Create an LLM without guided decoding as a baseline. llm = LLM(model="facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.3) @@ -26,10 +28,15 @@ def test_lazy_outlines(sample_regex): # make sure outlines is not imported assert 'outlines' not in sys.modules + # Destroy the LLM object and free up the GPU memory. + del llm + cleanup_dist_env_and_memory() + + # Create an LLM with guided decoding enabled. 
llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", - gpu_memory_utilization=0.3) + gpu_memory_utilization=0.6) sampling_params = SamplingParams(temperature=0.8, top_p=0.95) outputs = llm.generate( prompts=[ diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 0b6026a89c758..65699e609e4a8 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,51 +1,56 @@ """Tests for HF_HUB_OFFLINE mode""" import importlib import sys -import weakref import pytest from vllm import LLM - -from ...conftest import cleanup - -MODEL_NAME = "facebook/opt-125m" +from vllm.distributed import cleanup_dist_env_and_memory + +MODEL_CONFIGS = [ + { + "model": "facebook/opt-125m", + "enforce_eager": True, + "gpu_memory_utilization": 0.20, + "max_model_len": 64, + "max_num_batched_tokens": 64, + "max_num_seqs": 64, + "tensor_parallel_size": 1, + }, + { + "model": "mistralai/Mistral-7B-Instruct-v0.1", + "enforce_eager": True, + "gpu_memory_utilization": 0.95, + "max_model_len": 64, + "max_num_batched_tokens": 64, + "max_num_seqs": 64, + "tensor_parallel_size": 1, + "tokenizer_mode": "mistral", + }, +] @pytest.fixture(scope="module") -def llm(): - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) - - with llm.deprecate_legacy_api(): - yield weakref.proxy(llm) +def cache_models(): + # Cache model files first + for model_config in MODEL_CONFIGS: + LLM(**model_config) + cleanup_dist_env_and_memory() - del llm - - cleanup() + yield @pytest.mark.skip_global_cleanup -def test_offline_mode(llm: LLM, monkeypatch): - # we use the llm fixture to ensure the model files are in-cache - del llm - +@pytest.mark.usefixtures("cache_models") +def test_offline_mode(monkeypatch): # Set HF to offline mode and ensure we can still construct an LLM try: monkeypatch.setenv("HF_HUB_OFFLINE", "1") # Need to re-import huggingface_hub and friends to setup offline mode _re_import_modules() # Cached model files should be used in offline mode - LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) + for model_config in MODEL_CONFIGS: + LLM(**model_config) finally: # Reset the environment after the test # NB: Assuming tests are run in online mode diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 3af0032fd2fb0..a29747603622b 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -851,14 +851,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_response_format_json_schema(client: openai.AsyncOpenAI): + prompt = 'what is 1+1? The format is "result": 2' + # Check that this prompt cannot lead to a valid JSON without json_schema for _ in range(2): resp = await client.chat.completions.create( model=MODEL_NAME, messages=[{ - "role": - "user", - "content": ('what is 1+1? 
please respond with a JSON object, ' - 'the format is {"result": 2}') + "role": "user", + "content": prompt + }], + ) + content = resp.choices[0].message.content + assert content is not None + with pytest.raises((json.JSONDecodeError, AssertionError)): + loaded = json.loads(content) + assert loaded == {"result": 2}, loaded + + for _ in range(2): + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": prompt }], response_format={ "type": "json_schema", diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index ec550fe82c70f..d9342fad9f018 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -22,12 +22,12 @@ class MockHFConfig: @dataclass class MockModelConfig: + task = "generate" tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" max_model_len = 100 tokenizer_revision = None - embedding_mode = False multimodal_config = MultiModalConfig() hf_config = MockHFConfig() diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 81d79601124a7..8311a5cb3c2d4 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,6 +23,8 @@ @pytest.fixture(scope="module") def server(): args = [ + "--task", + "generate", "--dtype", "bfloat16", "--max-model-len", diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 6ded5102c9314..1d8c328b73259 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -18,7 +18,8 @@ @pytest.fixture(scope="module") def phi3v_model_config(): return ModelConfig(PHI3V_MODEL_ID, - PHI3V_MODEL_ID, + task="generate", + tokenizer=PHI3V_MODEL_ID, tokenizer_mode="auto", trust_remote_code=True, dtype="bfloat16", @@ -387,3 +388,29 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( "text": "What about these two?" }] }], phi3v_model_config, phi3v_tokenizer) + + +def test_parse_chat_messages_multiple_images_uncommon_input( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?" 
+ }] + _assert_mm_data_is_image_input(mm_data, 2) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index f471dcee938be..5671207ac847e 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -20,21 +20,21 @@ def test_env(name: str, device: str, monkeypatch): if device == "cpu": with patch("vllm.attention.selector.is_cpu", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.is_hip", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.is_openvino", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "OPENVINO" else: - backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == name @@ -46,37 +46,32 @@ def test_flash_attn(monkeypatch): # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(16, None, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(16, None, torch.float8_e4m3fn, None, 16, False) + backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(16, None, torch.float16, "fp8", 16, False) + backend = which_attn_to_use(16, torch.float16, "fp8", 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported block size - backend = which_attn_to_use(16, None, torch.float16, None, 8, False) - assert backend.name != STR_FLASH_ATTN_VAL - - # Unsupported sliding window - backend = which_attn_to_use(16, 1, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 8, False) assert backend.name != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(16, None, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(17, None, torch.float16, None, 16, False) + backend = which_attn_to_use(17, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, - True) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True) assert backend.name != STR_FLASH_ATTN_VAL @@ -84,4 +79,4 @@ def test_invalid_env(monkeypatch): """Throw an exception if the backend name is invalid.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with pytest.raises(ValueError): - which_attn_to_use(16, None, torch.float16, None, 16, False) + which_attn_to_use(16, torch.float16, None, 16, 
False) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 3e9b4d9a4f8a0..35c29c5bd1028 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -78,6 +78,7 @@ def ref_paged_attn( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("sliding_window", [None, 256]) @torch.inference_mode() def test_flash_attn_with_paged_kv( kv_lens: List[int], @@ -87,6 +88,7 @@ def test_flash_attn_with_paged_kv( block_size: int, soft_cap: Optional[float], num_blocks: int, + sliding_window: Optional[int], ) -> None: torch.set_default_device("cuda") seed_everything(0) @@ -96,6 +98,8 @@ def test_flash_attn_with_paged_kv( assert num_query_heads % num_kv_heads == 0 max_kv_len = max(kv_lens) scale = head_size**-0.5 + window_size = ((sliding_window - 1, 0) if sliding_window is not None else + (-1, -1)) query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) key_cache = torch.randn(num_blocks, @@ -121,18 +125,18 @@ def test_flash_attn_with_paged_kv( block_table=block_tables, cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, + window_size=window_size, ).squeeze(1) - ref_output = ref_paged_attn( - query=query, - key_cache=key_cache, - value_cache=value_cache, - query_lens=[1] * num_seqs, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap, - ) + ref_output = ref_paged_attn(query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap, + sliding_window=sliding_window) torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -141,7 +145,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("sliding_window", [None]) +@pytest.mark.parametrize("sliding_window", [None, 256]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @@ -166,8 +170,7 @@ def test_varlen_with_paged_kv( assert num_query_heads % num_kv_heads == 0 max_query_len = max(query_lens) max_kv_len = max(kv_lens) - window_size = ((sliding_window, - sliding_window) if sliding_window is not None else + window_size = ((sliding_window - 1, 0) if sliding_window is not None else (-1, -1)) scale = head_size**-0.5 diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 405c0d0efad65..e40f0dd74602e 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -1,20 +1,16 @@ -import contextlib -import gc import tempfile from collections import OrderedDict from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest -import ray import torch import torch.nn as nn from huggingface_hub import snapshot_download import vllm from vllm.config import LoRAConfig -from vllm.distributed import (destroy_distributed_environment, - destroy_model_parallel, +from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -48,16 +44,6 @@ class ContextInfo(TypedDict): }] -def cleanup(): - destroy_model_parallel() - 
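The window_size tuples introduced above follow flash-attn's (left, right) convention, where -1 disables windowing on that side; a causal sliding window of N tokens becomes (N - 1, 0) because the current token occupies one of the N slots and nothing to the right may be attended. A small helper restating that mapping (a sketch, not part of the diff):

from typing import Optional, Tuple


def to_flash_attn_window(sliding_window: Optional[int]) -> Tuple[int, int]:
    # None -> (-1, -1): unlimited context on both sides.
    # N    -> (N - 1, 0): the token attends to itself plus N - 1 left tokens.
    if sliding_window is None:
        return (-1, -1)
    return (sliding_window - 1, 0)


assert to_flash_attn_window(None) == (-1, -1)
assert to_flash_attn_window(256) == (255, 0)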
destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - torch.cuda.empty_cache() - ray.shutdown() - - @pytest.fixture() def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. @@ -72,7 +58,7 @@ def should_do_global_cleanup_after_test(request) -> bool: def cleanup_fixture(should_do_global_cleanup_after_test: bool): yield if should_do_global_cleanup_after_test: - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture @@ -87,7 +73,7 @@ def dist_init(): ) initialize_model_parallel(1, 1) yield - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture @@ -238,7 +224,7 @@ def long_context_lora_files_32k(): def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) infos: Dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] @@ -259,7 +245,7 @@ def long_context_infos(long_context_lora_files_16k_1, @pytest.fixture def llama_2_7b_engine_extra_embeddings(): - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) get_model_old = get_model def get_model_patched(*, model_config, device_config, **kwargs): @@ -272,7 +258,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) yield engine.llm_engine del engine - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index cbc3668997817..0ba2ce3617b67 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -3,10 +3,9 @@ import pytest import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from .conftest import cleanup - MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 @@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM(MODEL_PATH, enable_lora=True, @@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 @@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) del llm_tp4 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp4 diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index ad8490353998f..e2a4f1ed0496a 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -4,10 +4,9 @@ import ray import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from .conftest import cleanup - MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM(MODEL_PATH, enable_lora=True, @@ -103,7 +102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 @@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1) del llm_tp4 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp4 diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 5636c96435024..d004c65929418 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -6,11 +6,10 @@ import pytest import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest from vllm.utils import is_hip -from .conftest import cleanup - @dataclass class ModelWithQuantization: @@ -160,7 +159,7 @@ def expect_match(output, expected_output): print("removing lora") del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.mark.parametrize("model", MODELS) @@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM( model=model.model_path, @@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..2f7ac85507425 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files): worker = Worker( 
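These LoRA tests swap the per-directory cleanup() helper for cleanup_dist_env_and_memory from vllm.distributed. Judging by the code deleted from tests/lora/conftest.py above, the shared helper covers the same teardown steps; the sketch below restates them under that assumption, with shutdown_ray treated as the keyword the tests pass:

import contextlib
import gc

import torch

from vllm.distributed import (destroy_distributed_environment,
                              destroy_model_parallel)


def cleanup_dist_env_and_memory_sketch(shutdown_ray: bool = False) -> None:
    # Illustrative equivalent of the deleted cleanup() fixture helper.
    destroy_model_parallel()
    destroy_distributed_environment()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    if shutdown_ray:
        import ray
        ray.shutdown()
    gc.collect()
    torch.cuda.empty_cache()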
model_config=ModelConfig( "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-2-7b-hf", + task="auto", + tokenizer="meta-llama/Llama-2-7b-hf", tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files): load_format="dummy", ), parallel_config=ParallelConfig(1, 1, False), - scheduler_config=SchedulerConfig(32, 32, 32), + scheduler_config=SchedulerConfig("generate", 32, 32, 32), device_config=DeviceConfig("cuda"), cache_config=CacheConfig(block_size=16, gpu_memory_utilization=1., diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 8798ff078843a..92e6086e312f7 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -6,13 +6,12 @@ from prometheus_client import REGISTRY from vllm import EngineArgs, LLMEngine +from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams -from ..conftest import cleanup - MODELS = [ "facebook/opt-125m", ] @@ -307,7 +306,7 @@ def test_metric_spec_decode_interval( finally: del engine - cleanup() + cleanup_dist_env_and_memory() def assert_metrics(engine: LLMEngine, disable_log_stats: bool, diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py index fcc158639748d..75625b35209ce 100644 --- a/tests/models/decoder_only/language/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -21,10 +21,14 @@ ] if not current_platform.is_cpu(): - # MiniCPM requires fused_moe which is not supported by CPU - MODELS.append("openbmb/MiniCPM3-4B") + MODELS += [ + # fused_moe which not supported on CPU + "openbmb/MiniCPM3-4B", + # Head size isn't supported on CPU + "h2oai/h2o-danube3-4b-base", + ] -#TODO: remove this after CPU float16 support ready +# TODO: remove this after CPU float16 support ready target_dtype = "float" if current_platform.is_cpu() else "half" diff --git a/tests/models/decoder_only/language/test_danube3_4b.py b/tests/models/decoder_only/language/test_danube3_4b.py deleted file mode 100644 index bdd498edc293d..0000000000000 --- a/tests/models/decoder_only/language/test_danube3_4b.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -This tests danube3 separately because its head size isn't supported on CPU yet. - -Run `pytest tests/models/test_danube3_4b.py`. 
-""" -import pytest - -from ...utils import check_outputs_equal - -MODELS = ["h2oai/h2o-danube3-4b-base"] - -target_dtype = "half" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index c27bf6a60a4f4..2dc231c595ffa 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -10,7 +10,7 @@ from ...utils import check_outputs_equal -MODELS = ["state-spaces/mamba-130m-hf"] +MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"] # Use lower-level interfaces to create this greedy generator, as mamba will diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py index 3c3b95b38baac..98f313eb9b9af 100644 --- a/tests/models/decoder_only/vision_language/test_intern_vit.py +++ b/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -6,7 +6,7 @@ from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModel, CLIPImageProcessor -from ....conftest import _ImageAssets, cleanup +from ....conftest import _ImageAssets # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner @@ -45,12 +45,13 @@ def run_intern_vit_test( for pixel_value in pixel_values ] + from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.models.intern_vit import InternVisionModel vllm_model = InternVisionModel(config) vllm_model.load_weights(hf_model.state_dict().items()) del hf_model - cleanup() + cleanup_dist_env_and_memory() vllm_model = vllm_model.to("cuda", dtype) vllm_outputs_per_image = [ @@ -58,7 +59,7 @@ def run_intern_vit_test( for pixel_value in pixel_values ] del vllm_model - cleanup() + cleanup_dist_env_and_memory() cos_similar = nn.CosineSimilarity(dim=-1) for vllm_output, hf_output in zip(vllm_outputs_per_image, diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index 12e8a961877cd..808421abd9103 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -89,6 +89,7 @@ def run_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, + task="generate", max_model_len=4096, max_num_seqs=2, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py 
b/tests/models/embedding/language/test_embedding.py index 5f704d854e5dc..39b6bbaf43180 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -6,21 +6,31 @@ from ..utils import check_embeddings_close +# Model, Guard MODELS = [ "intfloat/e5-mistral-7b-instruct", + "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2", ] +ENCODER_ONLY = [ + "BAAI/bge-base-en-v1.5", +] + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models( + monkeypatch, hf_runner, vllm_runner, example_prompts, - model: str, + model, dtype: str, ) -> None: + if model in ENCODER_ONLY: + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: @@ -33,7 +43,7 @@ def test_models( is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - with vllm_runner(model, dtype=dtype) as vllm_model: + with vllm_runner(model, dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) check_embeddings_close( diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index ea6b56cd02625..0ca90e6bfa52e 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -28,6 +28,7 @@ def test_models( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, + task="embedding", max_model_len=4096, max_num_seqs=2, dtype=dtype, diff --git a/tests/models/utils.py b/tests/models/utils.py index 86a624483c58a..2ea233a9a599c 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -3,7 +3,7 @@ import torch -from vllm.config import ModelConfig +from vllm.config import ModelConfig, TaskOption from vllm.inputs import InputContext from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.utils import is_cpu @@ -248,6 +248,7 @@ def check_logprobs_close( def build_model_context(model_name: str, + task: TaskOption = "auto", tokenizer_name: Optional[str] = None, trust_remote_code: bool = False, dtype: Optional[Union[str, torch.dtype]] = None, @@ -273,7 +274,8 @@ def build_model_context(model_name: str, model_config = ModelConfig( model_name, - tokenizer_name, + task=task, + tokenizer=tokenizer_name, tokenizer_mode="auto", trust_remote_code=trust_remote_code, dtype=dtype, diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 7d09b81060efd..13ad4a7966b9d 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor): model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, @@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype, model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, @@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, @@ -139,6 +142,7 @@ 
def test_image_mapper_multi(image_assets, mm_registry, num_images): model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 7b9e0b6e5234b..5044740c3e734 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -221,6 +221,7 @@ def test_max_tokens_kwarg_overrides(num_crops): expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": 1}) @@ -256,6 +257,7 @@ def test_max_tokens_kwarg_overrides(num_crops): def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs): """Ensure that max token calcs filters out invalid mm_processor_kwargs""" ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": 1}) @@ -278,12 +280,13 @@ def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs): ### Test overrides for the mapper @pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE]) -def test_default_mapper_with_processer_kwargs(image_assets, num_crops): +def test_default_mapper_with_processor_kwargs(image_assets, num_crops): """Ensure that the mapper processor kwargs can fall back to HF models.""" # NOTE - we don't validate bad inputs for the default mapper, because it's # through the automodel interface in transformers, so we can't easily # inspect what kwargs are or are not allowed. ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs={"num_crops": num_crops}, limit_mm_per_prompt={"image": 1}) @@ -311,6 +314,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops, init_num_crops, inference_num_crops) ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=init_kwargs, limit_mm_per_prompt={"image": 1}) @@ -348,6 +352,7 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets, """Ensure that custom mappers filters out invalid mm_processor_kwargs""" # Should filter out the init time kwargs ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": 1}) diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index eeac6ab43c05f..5a28943b7ecbc 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -4,8 +4,8 @@ """ import pytest -from tests.conftest import cleanup from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory MODEL_LEN_LEN = [ # Example models with sliding window. 
@@ -31,7 +31,7 @@ def test_disable_sliding_window(model_len_len, ): model_config.max_model_len) del vllm_disabled_model - cleanup() + cleanup_dist_env_and_memory() vllm_enabled_model = LLM(model, disable_sliding_window=False) vllm_enabled_model.generate("Hi my name is") @@ -41,4 +41,4 @@ def test_disable_sliding_window(model_len_len, ): model_config.max_model_len) del vllm_enabled_model - cleanup() + cleanup_dist_env_and_memory() diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index f2acf0d70afef..0f01f5f819ea4 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -107,8 +107,7 @@ def validate_generated_texts(hf_runner, quantization='bitsandbytes', load_format='bitsandbytes', tensor_parallel_size=vllm_tp_size, - enforce_eager=False, - gpu_memory_utilization=0.8) as llm: + enforce_eager=False) as llm: vllm_outputs = llm.generate_greedy(prompts, 8) vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner") diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index d18233fe1aeae..cf77ccec7a191 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -57,7 +57,8 @@ def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: try: model_config = ModelConfig(model_path, - model_path, + task="auto", + tokenizer=model_path, tokenizer_mode="auto", trust_remote_code=False, seed=0, diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b450ef97c89d4..b9cb3858c0068 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -4,10 +4,10 @@ import pytest from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.utils import set_random_seed from vllm.sequence import PromptLogprobs, SampleLogprobs -from ...conftest import cleanup from ...models.utils import (TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, check_logprobs_close, check_outputs_equal) @@ -44,7 +44,7 @@ def generate(): yield llm del llm - cleanup() + cleanup_dist_env_and_memory() return generate diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 07b9c6b3c6be6..2a45653622448 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -1,27 +1,18 @@ -import contextlib import functools import gc from typing import Callable, TypeVar import pytest -import ray import torch from typing_extensions import ParamSpec -from vllm.distributed import (destroy_distributed_environment, - destroy_model_parallel) +from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @pytest.fixture(autouse=True) def cleanup(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - ray.shutdown() - gc.collect() - torch.cuda.empty_cache() + cleanup_dist_env_and_memory(shutdown_ray=True) _P = ParamSpec("_P") diff --git a/tests/test_config.py b/tests/test_config.py index b89429005e1d0..69918b67607d9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2,6 +2,42 @@ from vllm.config import ModelConfig + +@pytest.mark.parametrize(("model_id", "expected_task"), [ + ("facebook/opt-125m", "generate"), + ("intfloat/e5-mistral-7b-instruct", "embedding"), +]) +def test_auto_task(model_id, expected_task): + config = 
ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + ) + + assert config.task == expected_task + + +@pytest.mark.parametrize(("model_id", "bad_task"), [ + ("facebook/opt-125m", "embedding"), + ("intfloat/e5-mistral-7b-instruct", "generate"), +]) +def test_incorrect_task(model_id, bad_task): + with pytest.raises(ValueError, match=r"does not support the .* task"): + ModelConfig( + model_id, + task=bad_task, + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + ) + + MODEL_IDS_EXPECTED = [ ("Qwen/Qwen1.5-7B", 32768), ("mistralai/Mistral-7B-v0.1", 4096), @@ -14,7 +50,8 @@ def test_disable_sliding_window(model_id_expected): model_id, expected = model_id_expected model_config = ModelConfig( model_id, - model_id, + task="auto", + tokenizer=model_id, tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -32,7 +69,8 @@ def test_get_sliding_window(): # when use_sliding_window is False. qwen2_model_config = ModelConfig( "Qwen/Qwen1.5-7B", - "Qwen/Qwen1.5-7B", + task="auto", + tokenizer="Qwen/Qwen1.5-7B", tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -49,7 +87,8 @@ def test_get_sliding_window(): mistral_model_config = ModelConfig( "mistralai/Mistral-7B-v0.1", - "mistralai/Mistral-7B-v0.1", + task="auto", + tokenizer="mistralai/Mistral-7B-v0.1", tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -70,7 +109,8 @@ def test_rope_customization(): llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", + task="auto", + tokenizer="meta-llama/Meta-Llama-3-8B-Instruct", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", @@ -82,7 +122,8 @@ def test_rope_customization(): llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", + task="auto", + tokenizer="meta-llama/Meta-Llama-3-8B-Instruct", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", @@ -98,7 +139,8 @@ def test_rope_customization(): longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", + task="auto", + tokenizer="lmsys/longchat-13b-16k", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", @@ -112,7 +154,8 @@ def test_rope_customization(): longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", + task="auto", + tokenizer="lmsys/longchat-13b-16k", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", diff --git a/tests/test_utils.py b/tests/test_utils.py index 268e6f8194abb..0fed8e678fc76 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -59,7 +59,7 @@ def dummy(*, old_arg: object = None, new_arg: object = None): with pytest.warns(DeprecationWarning, match="'old_arg'"): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(new_arg=1) @@ -69,10 +69,10 @@ def test_deprecate_kwargs_never(): def dummy(*, old_arg: object = None, new_arg: object = None): pass - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(new_arg=1) @@ -86,15 +86,15 @@ def dummy(*, old_arg: object = None, new_arg: object = None): with pytest.warns(DeprecationWarning, match="'old_arg'"): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(new_arg=1) is_deprecated = False - with 
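Throughout these config tests, ModelConfig now takes an explicit task argument and the tokenizer is passed by keyword instead of positionally. A compact sketch of the new construction pattern, reusing the facebook/opt-125m case from test_auto_task:

from vllm.config import ModelConfig

# "auto" lets ModelConfig resolve the task from the architecture; opt-125m is
# a text-generation model, so resolution picks "generate".
config = ModelConfig(
    "facebook/opt-125m",
    task="auto",
    tokenizer="facebook/opt-125m",
    tokenizer_mode="auto",
    trust_remote_code=False,
    seed=0,
    dtype="float16",
)
assert config.task == "generate"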
error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(new_arg=1) diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py new file mode 100644 index 0000000000000..3095ef4516796 --- /dev/null +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -0,0 +1,275 @@ +import json +from typing import Generator, List, Optional + +import partial_json_parser +import pytest +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall, + ToolCall) +from vllm.entrypoints.openai.tool_parsers import JambaToolParser +from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer + +MODEL = "ai21labs/Jamba-tiny-dev" + + +@pytest.fixture(scope="module") +def jamba_tokenizer(): + return get_tokenizer(tokenizer_name=MODEL) + + +@pytest.fixture +def jamba_tool_parser(jamba_tokenizer): + return JambaToolParser(jamba_tokenizer) + + +def assert_tool_calls(actual_tool_calls: List[ToolCall], + expected_tool_calls: List[ToolCall]): + assert len(actual_tool_calls) == len(expected_tool_calls) + + for actual_tool_call, expected_tool_call in zip(actual_tool_calls, + expected_tool_calls): + assert isinstance(actual_tool_call.id, str) + assert len(actual_tool_call.id) > 16 + + assert actual_tool_call.type == "function" + assert actual_tool_call.function == expected_tool_call.function + + +def stream_delta_message_generator( + jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer, + model_output: str) -> Generator[DeltaMessage, None, None]: + all_token_ids = jamba_tokenizer.encode(model_output, + add_special_tokens=False) + + previous_text = "" + previous_tokens = None + prefix_offset = 0 + read_offset = 0 + for i, delta_token in enumerate(all_token_ids): + delta_token_ids = [delta_token] + previous_token_ids = all_token_ids[:i] + current_token_ids = all_token_ids[:i + 1] + + (new_tokens, delta_text, new_prefix_offset, + new_read_offset) = detokenize_incrementally( + tokenizer=jamba_tokenizer, + all_input_ids=current_token_ids, + prev_tokens=previous_tokens, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=False, + spaces_between_special_tokens=True, + ) + + current_text = previous_text + delta_text + + delta_message = jamba_tool_parser.extract_tool_calls_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + request=None, # type: ignore[arg-type] + ) + if delta_message: + yield delta_message + + previous_text = current_text + previous_tokens = previous_tokens + new_tokens if previous_tokens\ + else new_tokens + prefix_offset = new_prefix_offset + read_offset = new_read_offset + + +def test_extract_tool_calls_no_tools(jamba_tool_parser): + model_output = "This is a test" + extracted_tool_calls = jamba_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert not extracted_tool_calls.tools_called + assert extracted_tool_calls.tool_calls == [] + assert extracted_tool_calls.content == model_output + + +@pytest.mark.parametrize( + ids=[ + "single_tool", + "single_tool_with_content", + "parallel_tools", + ], + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=[ + ( + ''' [\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": 
"fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + None), + ( + ''' Sure! let me call the tool for you.[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + " Sure! let me call the tool for you."), + ( + ''' [\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))), + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Orlando", + "state": "FL", + "unit": "fahrenheit" + }))) + ], + None) + ], +) +def test_extract_tool_calls(jamba_tool_parser, model_output, + expected_tool_calls, expected_content): + extracted_tool_calls = jamba_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert extracted_tool_calls.tools_called + + assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls) + + assert extracted_tool_calls.content == expected_content + + +@pytest.mark.parametrize( + ids=[ + "no_tools", + "single_tool", + "single_tool_with_content", + "parallel_tools", + ], + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=[ + ('''This is a test''', [], '''This is a test'''), + ( + ''' [\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + " "), + ( + ''' Sure! let me call the tool for you.[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + " Sure! 
let me call the tool for you."), + ( + ''' [\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))), + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Orlando", + "state": "FL", + "unit": "fahrenheit" + }))) + ], + " ") + ], +) +def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer, + model_output, expected_tool_calls, + expected_content): + other_content: str = '' + function_names: List[str] = [] + function_args_strs: List[str] = [] + tool_call_idx: int = -1 + tool_call_ids: List[Optional[str]] = [] + + for delta_message in stream_delta_message_generator( + jamba_tool_parser, jamba_tokenizer, model_output): + # role should never be streamed from tool parser + assert not delta_message.role + + if delta_message.content: + other_content += delta_message.content + + streamed_tool_calls = delta_message.tool_calls + + if streamed_tool_calls and len(streamed_tool_calls) > 0: + # make sure only one diff is present - correct even for parallel + assert len(streamed_tool_calls) == 1 + tool_call = streamed_tool_calls[0] + + # if a new tool is being called, set up empty arguments + if tool_call.index != tool_call_idx: + tool_call_idx = tool_call.index + function_args_strs.append("") + tool_call_ids.append(None) + + # if a tool call ID is streamed, make sure one hasn't been already + if tool_call.id and not tool_call_ids[tool_call.index]: + tool_call_ids[tool_call.index] = tool_call.id + + # if parts of the function start being streamed + if tool_call.function: + # if the function name is defined, set it. it should be streamed + # IN ENTIRETY, exactly one time. + if tool_call.function.name: + assert isinstance(tool_call.function.name, str) + function_names.append(tool_call.function.name) + + if tool_call.function.arguments: + # make sure they're a string and then add them to the list + assert isinstance(tool_call.function.arguments, str) + + function_args_strs[ + tool_call.index] += tool_call.function.arguments + + assert other_content == expected_content + + actual_tool_calls = [ + ToolCall(id=tool_call_id, + function=FunctionCall( + name=function_name, + arguments=partial_json_parser.ensure_json( + function_args_str, Allow.OBJ | Allow.STR))) + for tool_call_id, function_name, function_args_str in zip( + tool_call_ids, function_names, function_args_strs) + ] + assert_tool_calls(actual_tool_calls, expected_tool_calls) diff --git a/tests/utils.py b/tests/utils.py index 115cab80691f0..2ab7329485dfc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,7 +8,7 @@ import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union import openai import pytest @@ -454,13 +454,13 @@ def multi_process_parallel( @contextmanager -def error_on_warning(): +def error_on_warning(category: Type[Warning] = Warning): """ Within the scope of this context manager, tests will fail if any warning - is emitted. + of the given category is emitted. 
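error_on_warning now takes the warning category to escalate, so unrelated warnings no longer fail these deprecation tests. A condensed restatement of the patched context manager together with a usage example (the maybe-warning call is hypothetical):

import warnings
from contextlib import contextmanager
from typing import Type


@contextmanager
def error_on_warning(category: Type[Warning] = Warning):
    # As patched in tests/utils.py: only warnings of `category` become errors.
    with warnings.catch_warnings():
        warnings.filterwarnings("error", category=category)
        yield


with error_on_warning(DeprecationWarning):
    warnings.warn("noisy but unrelated", UserWarning)  # does not raise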
""" with warnings.catch_warnings(): - warnings.simplefilter("error") + warnings.filterwarnings("error", category=category) yield diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py new file mode 100644 index 0000000000000..acd2ed6836365 --- /dev/null +++ b/tests/worker/test_profile.py @@ -0,0 +1,70 @@ +import torch + +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.worker import Worker + + +def test_gpu_memory_profiling(): + # Tests the gpu profiling that happens in order to determine the number of + # KV cache blocks that we can allocate on the GPU. + # This test mocks the maximum available gpu memory so that it can run on + # any gpu setup. + + # Set up engine args to build a worker. + engine_args = EngineArgs(model="facebook/opt-125m", + dtype="half", + load_format="dummy") + engine_config = engine_args.create_engine_config() + engine_config.cache_config.num_gpu_blocks = 1000 + engine_config.cache_config.num_cpu_blocks = 1000 + + # Create the worker. + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + worker = Worker( + model_config=engine_config.model_config, + parallel_config=engine_config.parallel_config, + scheduler_config=engine_config.scheduler_config, + device_config=engine_config.device_config, + cache_config=engine_config.cache_config, + load_config=engine_config.load_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + is_driver_worker=True, + ) + + # Load the model so we can profile it + worker.init_device() + worker.load_model() + + # Set 10GiB as the total gpu ram to be device-agnostic + def mock_mem_info(): + current_usage = torch.cuda.memory_stats( + )["allocated_bytes.all.current"] + mock_total_bytes = 10 * 1024**3 + free = mock_total_bytes - current_usage + + return (free, mock_total_bytes) + + from unittest.mock import patch + with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): + gpu_blocks, _ = worker.determine_num_available_blocks() + + # Peak vram usage by torch should be 0.7077 GiB + # No memory should be allocated outside of torch + # 9.0 GiB should be the utilization target + # 8.2923 GiB should be available for the KV cache + block_size = CacheEngine.get_cache_block_size( + engine_config.cache_config, engine_config.model_config, + engine_config.parallel_config) + + expected_blocks = (8.2923 * 1024**3) // block_size + + # Check within a small tolerance for portability + # Hardware, kernel, or dependency changes could all affect memory + # utilization. + # A 10 block tolerance here should be about 6MB of wiggle room. + assert abs(gpu_blocks - expected_blocks) < 10 diff --git a/tools/mypy.sh b/tools/mypy.sh index d69b61c7f34fc..14b0976a27da5 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -2,6 +2,10 @@ CI=${1:-0} +if [ $CI -eq 1 ]; then + set -e +fi + run_mypy() { echo "Running mypy on $1" if [ $CI -eq 1 ] && [ -z "$1" ]; then diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 2bc36ff18a96b..9ea89eca01f5b 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -15,8 +15,11 @@ class AttentionType(Enum): DECODER = auto() # Decoder attention between previous layer Q/K/V - ENCODER = auto() # Encoder attention between previous layer Q/K/V - ENCODER_DECODER = auto() # Attention between dec. Q and enc. 
K/V + ENCODER = auto( + ) # Encoder attention between previous layer Q/K/V for encoder-decoder + ENCODER_ONLY = auto() # Encoder attention between previous layer Q/K/V + ENCODER_DECODER = auto( + ) # Attention between dec. Q and enc. K/V for encoder-decoder class AttentionBackend(ABC): diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index d54dbdcb19495..ffa05e80623ac 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -32,7 +32,7 @@ def get_supported_head_sizes() -> List[int]: @staticmethod def get_name() -> str: - return "flash-attn" + return "FLASH_ATTN" @staticmethod def get_impl_cls() -> Type["FlashAttentionImpl"]: @@ -524,8 +524,8 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window, sliding_window) - if sliding_window is not None else (-1, -1)) + self.sliding_window = ((sliding_window - 1, + 0) if sliding_window is not None else (-1, -1)) self.kv_cache_dtype = kv_cache_dtype if logits_soft_cap is None: # In flash-attn, setting logits_soft_cap as 0 means no soft cap. @@ -535,12 +535,6 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - if sliding_window is not None: - # NOTE(woosuk): flash-attn's sliding window does not work with - # paged KV cache. - raise ValueError( - "Sliding window is not supported in FlashAttention.") - support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() if head_size not in support_head_sizes: raise ValueError( @@ -704,6 +698,7 @@ def unified_flash_attention( max_seqlen_k=max_seq_len, softmax_scale=softmax_scale, causal=True, + window_size=window_size, alibi_slopes=alibi_slopes, block_table=prefill_meta.block_tables, softcap=logits_soft_cap, @@ -725,6 +720,7 @@ def unified_flash_attention( max_seqlen_k=decode_meta.max_decode_seq_len, softmax_scale=softmax_scale, causal=True, + window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, block_table=decode_meta.block_tables, @@ -739,6 +735,7 @@ def unified_flash_attention( cache_seqlens=decode_meta.seq_lens_tensor, softmax_scale=softmax_scale, causal=True, + window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, ).squeeze(1) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index dd9a0fb9d94df..e43fb134a6a5a 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -17,6 +17,7 @@ import torch +import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, @@ -39,7 +40,7 @@ class FlashInferBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "flashinfer" + return "FLASHINFER" @staticmethod def get_impl_cls() -> Type["FlashInferImpl"]: @@ -124,7 +125,8 @@ def _get_decode_wrapper(self): self.runner.parallel_config)) num_kv_heads = self.runner.model_config.get_num_kv_heads( self.runner.parallel_config) - use_tensor_cores = num_qo_heads // num_kv_heads > 4 + use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( + num_qo_heads // num_kv_heads > 4) self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( self._get_workspace_buffer(), "NHD", @@ -183,7 +185,8 @@ def graph_capture_get_metadata_for_batch( self.runner.parallel_config)) num_kv_heads = 
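The FlashInfer change gates tensor-core decode on either an environment override or the existing heads-ratio heuristic. A one-function sketch of that decision, treating VLLM_FLASHINFER_FORCE_TENSOR_CORES as the boolean flag the diff reads through vllm.envs:

import vllm.envs as envs


def should_use_tensor_cores(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Forced on via the env var, or enabled when grouped-query attention packs
    # more than 4 query heads per KV head.
    return envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or (
        num_qo_heads // num_kv_heads > 4)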
self.runner.model_config.get_num_kv_heads( self.runner.parallel_config) - use_tensor_cores = num_qo_heads // num_kv_heads > 4 + use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( + num_qo_heads // num_kv_heads > 4) self._graph_decode_wrapper = \ CUDAGraphBatchDecodeWithPagedKVCacheWrapper( self._graph_decode_workspace_buffer, _indptr_buffer, diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 7398732ddfc92..1eb5fe10d76db 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -19,7 +19,7 @@ class IpexAttnBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "ipex-attn" + return "IPEX" @staticmethod def get_impl_cls() -> Type["IpexAttnBackendImpl"]: diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index 8b36230730380..6fddfc2002120 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -38,7 +38,7 @@ class OpenVINOAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "openvino" + return "OPENVINO" @staticmethod def get_impl_cls(): diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 56d3d3b482e58..6fee81de14420 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -11,6 +11,10 @@ class PallasAttentionBackend(AttentionBackend): + @staticmethod + def get_name() -> str: + return "PALLAS" + @staticmethod def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: return PallasAttentionBackendImpl diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 3987986f1786b..4116fbf00020c 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -20,7 +20,7 @@ class PlaceholderAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "placeholder-attn" + return "NO_ATTENTION" @staticmethod def get_impl_cls() -> Type["PlaceholderAttentionImpl"]: diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 682eac50126ad..c2aec4aaa74e7 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -28,7 +28,7 @@ class ROCmFlashAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "rocm-flash-attn" + return "ROCM_FLASH" @staticmethod def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]: diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index ef8d576616838..1fb7c37578f20 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -25,7 +25,7 @@ class TorchSDPABackend(AttentionBackend): @staticmethod def get_name() -> str: - return "torch-sdpa" + return "TORCH_SDPA" @staticmethod def get_impl_cls() -> Type["TorchSDPABackendImpl"]: diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 358a223e7ed0e..d1a44f3e8bfa6 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -317,8 +317,8 @@ def graph_capture_get_metadata_for_batch( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. 
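Several attention backends above switch get_name() from ad-hoc lowercase strings to the canonical upper-case identifiers used by the selector and by VLLM_ATTENTION_BACKEND overrides. A reference mapping of the renames made in this diff (assembled from the hunks, not an exhaustive registry):

# Old get_name() value -> new canonical name, per the hunks in this diff.
ATTN_BACKEND_RENAMES = {
    "flash-attn": "FLASH_ATTN",
    "flashinfer": "FLASHINFER",
    "ipex-attn": "IPEX",
    "openvino": "OPENVINO",
    "placeholder-attn": "NO_ATTENTION",
    "rocm-flash-attn": "ROCM_FLASH",
    "torch-sdpa": "TORCH_SDPA",
    "xformers": "XFORMERS",
}
# PallasAttentionBackend gains a get_name() returning "PALLAS".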
- assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ + assert self.runner.attn_backend.get_name() == "XFORMERS", \ + f"Expected attn_backend name to be 'XFORMERS', but "\ f" got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -337,8 +337,8 @@ def get_graph_input_buffers( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ + assert self.runner.attn_backend.get_name() == "XFORMERS", \ + f"Expected attn_backend name to be 'XFORMERS', but "\ f" got '{self.runner.attn_backend.get_name()}'" self._add_additonal_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) @@ -356,8 +356,8 @@ def prepare_graph_input_buffers( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ + assert self.runner.attn_backend.get_name() == "XFORMERS", \ + f"Expected attn_backend name to be 'XFORMERS', but "\ f" got '{self.runner.attn_backend.get_name()}'" self._prepare_input_buffers_for_enc_dec_model( attn_metadata, input_buffers) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 25b86176f630e..5aaf13d8ea744 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -24,7 +24,7 @@ class XFormersBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "xformers" + return "XFORMERS" @staticmethod def get_impl_cls() -> Type["XFormersImpl"]: @@ -287,13 +287,15 @@ def _get_attn_bias( * Appropriate attention bias value given the attention type ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): return attn_metadata.attn_bias elif attn_type == AttentionType.ENCODER: return attn_metadata.encoder_attn_bias - else: - # attn_type == AttentionType.ENCODER_DECODER + elif attn_type == AttentionType.ENCODER_DECODER: return attn_metadata.cross_attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") def _set_attn_bias( @@ -313,7 +315,8 @@ def _set_attn_bias( encoder/decoder cross-attention ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): attn_metadata.attn_bias = attn_bias elif attn_type == AttentionType.ENCODER: attn_metadata.encoder_attn_bias = attn_bias @@ -371,6 +374,12 @@ def _get_seq_len_block_table_args( # No block tables associated with encoder attention return (attn_metadata.encoder_seq_lens_tensor, attn_metadata.max_encoder_seq_len, None) + elif attn_type == AttentionType.ENCODER_ONLY: + assert is_prompt, "Should not have decode for encoder only model." 
+ + # No block tables associated with encoder attention + return (attn_metadata.seq_lens_tensor, + attn_metadata.max_prefill_seq_len, None) else: raise AttributeError(f"Invalid attention type {str(attn_type)}") @@ -479,7 +488,10 @@ def forward( * ENCODER: no KV caching; pass encoder sequence attributes (encoder_seq_lens/encoder_seq_lens_tensor/ max_encoder_seq_len) to kernel, in lieu of decoder - sequence attributes (seq_lens/seq_lens_tensor/max_seq_len) + sequence attributes (seq_lens/seq_lens_tensor/max_seq_len). + Used for encoder branch of encoder-decoder models. + * ENCODER_ONLY: no kv_caching, uses the normal attention + attributes (seq_lens/seq_lens_tensor/max_seq_len). * ENCODER_DECODER: cross-attention behavior; use cross-attention block table for caching KVs derived from encoder hidden states; since KV sequence lengths @@ -509,6 +521,7 @@ def forward( and (not attn_metadata.is_all_encoder_attn_metadata_set)): raise AttributeError("Encoder attention requires setting " "encoder metadata attributes.") + elif (attn_type == AttentionType.ENCODER_DECODER and (not attn_metadata.is_all_cross_attn_metadata_set)): raise AttributeError("Encoder/decoder cross-attention " @@ -609,6 +622,8 @@ def forward( assert out.shape == output[:num_prefill_tokens].shape output[:num_prefill_tokens] = out else: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have prefix attention.") assert prefill_meta.query_start_loc is not None assert prefill_meta.max_query_len is not None @@ -638,6 +653,8 @@ def forward( output[:num_prefill_tokens] = out if decode_meta := attn_metadata.decode_metadata: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") ( seq_lens_arg, @@ -703,36 +720,60 @@ def _run_memory_efficient_xformers_forward( None, :].expand(value.shape[0], self.num_kv_heads, self.num_queries_per_kv, value.shape[-1]) + # Set attention bias if not provided. This typically happens at # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. attn_bias = _get_attn_bias(attn_metadata, attn_type) if attn_bias is None: if self.alibi_slopes is None: + + # Cross attention block of decoder branch of encoder-decoder + # model uses seq_lens for dec / encoder_seq_lens for enc if (attn_type == AttentionType.ENCODER_DECODER): assert attn_metadata.seq_lens is not None assert attn_metadata.encoder_seq_lens is not None - # Default enc/dec cross-attention mask is non-causal + # Cross-attention mask is non-causal attn_bias = BlockDiagonalMask.from_seqlens( attn_metadata.seq_lens, attn_metadata.encoder_seq_lens) + + # Encoder branch of encoder-decoder model uses + # attn_metadata.encoder_seq_lens elif attn_type == AttentionType.ENCODER: + assert attn_metadata.encoder_seq_lens is not None - # Default encoder self-attention mask is non-causal + # Encoder self-attention mask is non-causal attn_bias = BlockDiagonalMask.from_seqlens( attn_metadata.encoder_seq_lens) - else: + + # Self-attention block of encoder-only model just + # uses the seq_lens directly. 
+ elif attn_type == AttentionType.ENCODER_ONLY: assert attn_metadata.seq_lens is not None - # Default decoder self-attention mask is causal + # Encoder self-attention mask is non-causal + attn_bias = BlockDiagonalMask.from_seqlens( + attn_metadata.seq_lens) + + # Self-attention block of decoder branch just + # uses the seq_lens directly + elif attn_type == AttentionType.DECODER: + assert attn_metadata.seq_lens is not None + + # Decoder self-attention mask is causal attn_bias = BlockDiagonalCausalMask.from_seqlens( attn_metadata.seq_lens) + else: + raise ValueError("Unknown AttentionType: %s", attn_type) + if self.sliding_window is not None: attn_bias = attn_bias.make_local_attention( self.sliding_window) attn_bias = [attn_bias] else: + assert attn_type == AttentionType.DECODER assert attn_metadata.seq_lens is not None attn_bias = _make_alibi_bias(self.alibi_slopes, self.num_kv_heads, query.dtype, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b46f0721d0caf..33d05cbd3fe01 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -78,10 +78,9 @@ def __init__( # During model initialization, the default dtype is set as the model # weight and activation dtype. dtype = torch.get_default_dtype() - attn_backend = get_attn_backend(head_size, sliding_window, dtype, - kv_cache_dtype, block_size, - is_attention_free, blocksparse_params - is not None) + attn_backend = get_attn_backend(head_size, dtype, kv_cache_dtype, + block_size, is_attention_free, + blocksparse_params is not None) impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 7edb7676ea2cd..4ff86573e664d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -90,7 +90,6 @@ def get_global_forced_attn_backend() -> Optional[_Backend]: @lru_cache(maxsize=None) def get_attn_backend( head_size: int, - sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, @@ -105,8 +104,8 @@ def get_attn_backend( BlocksparseFlashAttentionBackend) return BlocksparseFlashAttentionBackend - backend = which_attn_to_use(head_size, sliding_window, dtype, - kv_cache_dtype, block_size, is_attention_free) + backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size, + is_attention_free) if backend == _Backend.FLASH_ATTN: from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) @@ -155,7 +154,6 @@ def get_attn_backend( def which_attn_to_use( head_size: int, - sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, @@ -243,10 +241,6 @@ def which_attn_to_use( "Cannot use FlashAttention-2 backend for block size not " "divisible by 16.") selected_backend = _Backend.XFORMERS - elif sliding_window is not None: - logger.info( - "Cannot use FlashAttention-2 backend due to sliding window.") - selected_backend = _Backend.XFORMERS # FlashAttn is valid for the model, checking if the package is installed. 
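With ENCODER_ONLY added, the xformers backend now builds its non-ALiBi attention bias per AttentionType: a causal block-diagonal mask for decoder self-attention and non-causal block-diagonal masks for encoder, encoder-only, and encoder-decoder cross-attention. A condensed sketch of that dispatch using the xformers mask constructors referenced in the hunks above (sliding-window and ALiBi handling omitted):

from typing import List

from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
                                          BlockDiagonalMask)

from vllm.attention.backends.abstract import AttentionType


def build_attn_bias(attn_type: AttentionType, seq_lens: List[int],
                    encoder_seq_lens: List[int]):
    if attn_type == AttentionType.ENCODER_DECODER:
        # Cross-attention: decoder queries over encoder keys, non-causal.
        return BlockDiagonalMask.from_seqlens(seq_lens, encoder_seq_lens)
    if attn_type == AttentionType.ENCODER:
        return BlockDiagonalMask.from_seqlens(encoder_seq_lens)
    if attn_type == AttentionType.ENCODER_ONLY:
        return BlockDiagonalMask.from_seqlens(seq_lens)
    if attn_type == AttentionType.DECODER:
        return BlockDiagonalCausalMask.from_seqlens(seq_lens)
    raise ValueError(f"Unknown AttentionType: {attn_type}")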
if selected_backend == _Backend.FLASH_ATTN: diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 04624b8b94432..1b48538734dae 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,5 +1,7 @@ from dataclasses import dataclass -from typing import List, Optional +from typing import Dict, List, Optional + +from vllm.sequence import Logprob @dataclass @@ -11,6 +13,7 @@ class BeamSearchSequence: """ # The tokens includes the prompt. tokens: List[int] + logprobs: List[Dict[int, Logprob]] cum_logprob: float = 0.0 text: Optional[str] = None @@ -28,7 +31,7 @@ class BeamSearchInstance: def __init__(self, prompt_tokens: List[int]): self.beams: List[BeamSearchSequence] = [ - BeamSearchSequence(tokens=prompt_tokens) + BeamSearchSequence(tokens=prompt_tokens, logprobs=[]) ] self.completed: List[BeamSearchSequence] = [] diff --git a/vllm/config.py b/vllm/config.py index 4533fb017188c..f57aa4048ae9b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,8 +1,8 @@ import enum import json from dataclasses import dataclass, field, fields -from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping, - Optional, Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, + Mapping, Optional, Set, Tuple, Type, Union) import torch from transformers import PretrainedConfig @@ -33,6 +33,11 @@ _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 +TaskOption = Literal["auto", "generate", "embedding"] + +# "draft" is only used internally for speculative decoding +_Task = Literal["generate", "embedding", "draft"] + class ModelConfig: """Configuration for the model. @@ -40,7 +45,11 @@ class ModelConfig: Args: model: Name or path of the huggingface model to use. It is also used as the content for `model_name` tag in metrics - output when `served_model_name` is not specified. + output when `served_model_name` is not specified. + task: The task to use the model for. Each vLLM instance only supports + one task, even if the same model can be used for multiple tasks. + When the model only supports one task, "auto" can be used to select + it; otherwise, you must specify explicitly which task to use. tokenizer: Name or path of the huggingface tokenizer to use. tokenizer_mode: Tokenizer mode. 
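# A minimal sketch of the extended BeamSearchSequence from vllm/beam_search.py
# above: each appended token now also records the per-step logprobs dict
# alongside the cumulative score (token ids and logprob values are made up).
from vllm.beam_search import BeamSearchSequence
from vllm.sequence import Logprob

beam = BeamSearchSequence(tokens=[1, 2, 3], logprobs=[])  # prompt only

step_logprobs = {42: Logprob(logprob=-0.1), 7: Logprob(logprob=-2.3)}
new_beam = BeamSearchSequence(
    tokens=beam.tokens + [42],
    logprobs=beam.logprobs + [step_logprobs],
    cum_logprob=beam.cum_logprob + step_logprobs[42].logprob,
)
assert len(new_beam.logprobs) == 1  # one logprobs dict per generated token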
"auto" will use the fast tokenizer if available, "slow" will always use the slow tokenizer, and @@ -108,6 +117,7 @@ class ModelConfig: def __init__(self, model: str, + task: Union[TaskOption, _Task], tokenizer: str, tokenizer_mode: str, trust_remote_code: bool, @@ -207,7 +217,11 @@ def __init__(self, self.override_neuron_config = override_neuron_config if is_neuron( ) else None - self._verify_embedding_mode() + + supported_tasks, task = self._resolve_task(task, self.hf_config) + self.supported_tasks = supported_tasks + self.task: Final = task + self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() @@ -241,18 +255,44 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto', 'slow' or 'mistral'.") self.tokenizer_mode = tokenizer_mode - def _verify_embedding_mode(self) -> None: - architectures = getattr(self.hf_config, "architectures", []) + def _resolve_task( + self, + task_option: Union[TaskOption, _Task], + hf_config: PretrainedConfig, + ) -> Tuple[Set[_Task], _Task]: + if task_option == "draft": + return {"draft"}, "draft" + + architectures = getattr(hf_config, "architectures", []) + + task_support: Dict[_Task, bool] = { + # NOTE: Listed from highest to lowest priority, + # in case the model supports multiple of them + "generate": ModelRegistry.is_text_generation_model(architectures), + "embedding": ModelRegistry.is_embedding_model(architectures), + } + supported_tasks_lst: List[_Task] = [ + task for task, is_supported in task_support.items() if is_supported + ] + supported_tasks = set(supported_tasks_lst) + + if task_option == "auto": + selected_task = next(iter(supported_tasks_lst)) - # TODO: Allow the same model architecture to be specified as either - # generation or embedding model - if "Phi3VForCausalLM" in architectures: - # Match both remote and local names - embedding_mode = "/VLM2Vec" in self.model + if len(supported_tasks) > 1: + logger.info( + "This model supports multiple tasks: %s. " + "Defaulting to '%s'.", supported_tasks, selected_task) else: - embedding_mode = ModelRegistry.is_embedding_model(architectures) + if task_option not in supported_tasks: + msg = ( + f"This model does not support the '{task_option}' task. " + f"Supported tasks: {supported_tasks}") + raise ValueError(msg) + + selected_task = task_option - self.embedding_mode = embedding_mode + return supported_tasks, selected_task def _parse_quant_hf_config(self): quant_cfg = getattr(self.hf_config, "quantization_config", None) @@ -401,7 +441,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, # Async postprocessor is not necessary with embedding mode # since there is no token generation - if self.embedding_mode: + if self.task == "embedding": self.use_async_output_proc = False # Reminder: Please update docs/source/serving/compatibility_matrix.rst @@ -582,11 +622,6 @@ def is_encoder_decoder_model(self) -> bool: (hasattr(self.hf_config, "text_config") and getattr( self.hf_config.text_config, "is_encoder_decoder", False))) - @property - def is_embedding_model(self) -> bool: - """Extract the embedding model flag.""" - return self.embedding_mode - @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None @@ -943,6 +978,7 @@ class SchedulerConfig: """Scheduler configuration. Args: + task: The task to use the model for. max_num_batched_tokens: Maximum number of tokens to be processed in a single iteration. 
max_num_seqs: Maximum number of sequences to be processed in a single @@ -957,7 +993,6 @@ class SchedulerConfig: prompt latency) before scheduling next prompt. enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. - embedding_mode: Whether the running model is for embedding. preemption_mode: Whether to perform preemption by swapping or recomputation. If not specified, we determine the mode as follows: We use recomputation by default since it incurs lower overhead than @@ -972,13 +1007,13 @@ class SchedulerConfig: """ def __init__(self, + task: _Task, max_num_batched_tokens: Optional[int], max_num_seqs: int, max_model_len: int, num_lookahead_slots: int = 0, delay_factor: float = 0.0, enable_chunked_prefill: bool = False, - embedding_mode: bool = False, is_multimodal_model: bool = False, preemption_mode: Optional[str] = None, num_scheduler_steps: int = 1, @@ -1002,7 +1037,7 @@ def __init__(self, # for higher throughput. max_num_batched_tokens = max(max_model_len, 2048) - if embedding_mode: + if task == "embedding": # For embedding, choose specific value for higher throughput max_num_batched_tokens = max( max_num_batched_tokens, @@ -1022,12 +1057,12 @@ def __init__(self, "Chunked prefill is enabled with max_num_batched_tokens=%d.", self.max_num_batched_tokens) + self.task: Final = task self.max_num_seqs = max_num_seqs self.max_model_len = max_model_len self.num_lookahead_slots = num_lookahead_slots self.delay_factor = delay_factor self.chunked_prefill_enabled = enable_chunked_prefill - self.embedding_mode = embedding_mode self.preemption_mode = preemption_mode self.num_scheduler_steps = num_scheduler_steps self.multi_step_stream_outputs = multi_step_stream_outputs @@ -1239,6 +1274,7 @@ def maybe_create_spec_config( ngram_prompt_lookup_min = 0 draft_model_config = ModelConfig( model=speculative_model, + task="draft", tokenizer=target_model_config.tokenizer, tokenizer_mode=target_model_config.tokenizer_mode, trust_remote_code=target_model_config.trust_remote_code, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f0c8e6bab4862..8d3fce106dd2c 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -313,7 +313,7 @@ def __init__( self.lora_config = lora_config version = "selfattn" - if (self.scheduler_config.embedding_mode + if (self.scheduler_config.task == "embedding" or self.cache_config.is_attention_free): version = "placeholder" diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6e1970bfed98a..8d4b673d2e6e4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -20,6 +20,7 @@ steps. 
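# A simplified sketch of how the task now reaches SchedulerConfig above: when
# no --max-num-batched-tokens is given and chunked prefill is off, embedding
# deployments get a higher default token budget (the constant mirrors the
# config.py value shown earlier in this diff).
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768


def default_max_num_batched_tokens(task: str, max_model_len: int) -> int:
    value = max(max_model_len, 2048)
    if task == "embedding":
        # For embedding, choose a specific value for higher throughput.
        value = max(value, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS)
    return value


assert default_max_num_batched_tokens("generate", 512) == 2048
assert default_max_num_batched_tokens("embedding", 512) == 32768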
""" import contextlib +import gc import pickle import weakref from collections import namedtuple @@ -36,7 +37,7 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import supports_custom_op +from vllm.utils import is_cpu, supports_custom_op @dataclass @@ -1129,6 +1130,19 @@ def destroy_distributed_environment(): torch.distributed.destroy_process_group() +def cleanup_dist_env_and_memory(shutdown_ray: bool = False): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + if shutdown_ray: + import ray # Lazy import Ray + ray.shutdown() + gc.collect() + if not is_cpu(): + torch.cuda.empty_cache() + + def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]: """ This is a collective operation that returns if each rank is in the same node diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 41963dcb16922..56582ab618797 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -3,7 +3,7 @@ import json from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional, - Tuple, Type, Union, cast) + Tuple, Type, Union, cast, get_args) import torch @@ -12,7 +12,7 @@ DeviceConfig, EngineConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TokenizerPoolConfig) + SpeculativeConfig, TaskOption, TokenizerPoolConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -84,6 +84,7 @@ class EngineArgs: model: str = 'facebook/opt-125m' served_model_name: Optional[Union[str, List[str]]] = None tokenizer: Optional[str] = None + task: TaskOption = "auto" skip_tokenizer_init: bool = False tokenizer_mode: str = 'auto' trust_remote_code: bool = False @@ -198,6 +199,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=str, default=EngineArgs.model, help='Name or path of the huggingface model to use.') + parser.add_argument( + '--task', + default=EngineArgs.task, + choices=get_args(TaskOption), + help='The task to use the model for. Each vLLM instance only ' + 'supports one task, even if the same model can be used for ' + 'multiple tasks. When the model only supports one task, "auto" ' + 'can be used to select it; otherwise, you must specify explicitly ' + 'which task to use.') parser.add_argument( '--tokenizer', type=nullable_str, @@ -418,7 +428,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='The fraction of GPU memory to be used for the model ' 'executor, which can range from 0 to 1. For example, a value of ' '0.5 would imply 50%% GPU memory utilization. If unspecified, ' - 'will use the default value of 0.9.') + 'will use the default value of 0.9. 
This is a global gpu memory ' + 'utilization limit, for example if 50%% of the gpu memory is ' + 'already used before vLLM starts and --gpu-memory-utilization is ' + 'set to 0.9, then only 40%% of the gpu memory will be allocated ' + 'to the model executor.') parser.add_argument( '--num-gpu-blocks-override', type=int, @@ -838,6 +852,7 @@ def from_cli_args(cls, args: argparse.Namespace): def create_model_config(self) -> ModelConfig: return ModelConfig( model=self.model, + task=self.task, # We know this is not None because we set it in __post_init__ tokenizer=cast(str, self.tokenizer), tokenizer_mode=self.tokenizer_mode, @@ -1026,13 +1041,13 @@ def create_engine_config(self) -> EngineConfig: " please file an issue with detailed information.") scheduler_config = SchedulerConfig( + task=model_config.task, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, num_lookahead_slots=num_lookahead_slots, delay_factor=self.scheduler_delay_factor, enable_chunked_prefill=self.enable_chunked_prefill, - embedding_mode=model_config.embedding_mode, is_multimodal_model=model_config.is_multimodal_model, preemption_mode=self.preemption_mode, num_scheduler_steps=self.num_scheduler_steps, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 61c21887e6816..a90bfce8491fb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,4 +1,5 @@ import time +from collections import Counter as collectionsCounter from collections import deque from contextlib import contextmanager from dataclasses import dataclass @@ -344,7 +345,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: observability_config=self.observability_config, ) - if not self.model_config.embedding_mode: + if self.model_config.task != "embedding": self._initialize_kv_caches() # If usage stat is enabled, collect relevant info. @@ -1116,7 +1117,7 @@ def _process_model_outputs(self, seq_group.metrics.model_execute_time = ( o.model_execute_time) - if self.model_config.embedding_mode: + if self.model_config.task == "embedding": self._process_sequence_group_outputs(seq_group, output) else: self.output_processor.process_prompt_logprob(seq_group, output) @@ -1617,6 +1618,25 @@ def _get_stats(self, n_requests: List[int] = [] finished_reason_requests: List[str] = [] + # Lora requests + running_lora_adapters = dict( + collectionsCounter([ + running_request.lora_request.lora_name + for scheduler in self.scheduler + for running_request in scheduler.running + if running_request.lora_request + ])) + waiting_lora_adapters = dict( + collectionsCounter([ + waiting_request.lora_request.lora_name + for scheduler in self.scheduler + for waiting_request in scheduler.waiting + if waiting_request.lora_request + ])) + max_lora_stat = "0" + if self.lora_config: + max_lora_stat = str(self.lora_config.max_loras) + # NOTE: This loop assumes prefill seq_groups are before # decode seq_groups in scheduled_seq_groups. 
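# Worked example of the global --gpu-memory-utilization semantics documented
# in the help text above (an 80 GiB device is assumed for illustration).
total_gib = 80.0
already_used_gib = 0.5 * total_gib           # 50% used before vLLM starts
gpu_memory_utilization = 0.9

global_limit_gib = gpu_memory_utilization * total_gib        # 72 GiB ceiling
available_to_vllm_gib = global_limit_gib - already_used_gib  # 32 GiB left
assert available_to_vllm_gib / total_gib == 0.4  # i.e. only 40% of the GPU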
if scheduler_outputs is not None: @@ -1738,7 +1758,9 @@ def _get_stats(self, num_generation_tokens_requests=num_generation_tokens_requests, n_requests=n_requests, finished_reason_requests=finished_reason_requests, - ) + max_lora=str(max_lora_stat), + waiting_lora_adapters=list(waiting_lora_adapters.keys()), + running_lora_adapters=list(running_lora_adapters.keys())) def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) @@ -1855,9 +1877,6 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: def is_encoder_decoder_model(self): return self.input_preprocessor.is_encoder_decoder_model() - def is_embedding_model(self): - return self.model_config.is_embedding_model - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs]): if self.model_config.is_multimodal_model: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 98bf59be3469d..a46625eff1e4a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -34,7 +34,11 @@ class Metrics: See https://prometheus.github.io/client_python/multiprocess/ for more details on limitations. """ + labelname_finish_reason = "finished_reason" + labelname_waiting_lora_adapters = "waiting_lora_adapters" + labelname_running_lora_adapters = "running_lora_adapters" + labelname_max_lora = "max_lora" _gauge_cls = prometheus_client.Gauge _counter_cls = prometheus_client.Counter _histogram_cls = prometheus_client.Histogram @@ -55,6 +59,16 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation="Number of requests waiting to be processed.", labelnames=labelnames, multiprocess_mode="sum") + self.gauge_lora_info = self._gauge_cls( + name="vllm:lora_requests_info", + documentation="Running stats on lora requests.", + labelnames=[ + self.labelname_running_lora_adapters, + self.labelname_max_lora, + self.labelname_waiting_lora_adapters, + ], + multiprocess_mode="livemostrecent", + ) self.gauge_scheduler_swapped = self._gauge_cls( name="vllm:num_requests_swapped", documentation="Number of requests swapped to CPU.", @@ -426,6 +440,9 @@ def _log_histogram(self, histogram, data: Union[List[int], for datum in data: histogram.labels(**self.labels).observe(datum) + def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None: + gauge.labels(**data).set(1) + def _log_prometheus(self, stats: Stats) -> None: # System state data self._log_gauge(self.metrics.gauge_scheduler_running, @@ -442,7 +459,17 @@ def _log_prometheus(self, stats: Stats) -> None: stats.cpu_prefix_cache_hit_rate) self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate, stats.gpu_prefix_cache_hit_rate) - + # Including max-lora in metric, in future this property of lora + # config maybe extended to be dynamic. 
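# A minimal sketch of the vllm:lora_requests_info gauge defined above, using
# prometheus_client directly (adapter names are illustrative; the real gauge
# additionally sets multiprocess_mode="livemostrecent").
import prometheus_client

gauge_lora_info = prometheus_client.Gauge(
    name="vllm:lora_requests_info",
    documentation="Running stats on lora requests.",
    labelnames=[
        "running_lora_adapters", "max_lora", "waiting_lora_adapters"
    ],
)

lora_labels = {
    "running_lora_adapters": "sql_adapter,chat_adapter",
    "waiting_lora_adapters": "",
    "max_lora": "8",
}
# The gauge value is always 1; the payload is carried entirely in the labels.
gauge_lora_info.labels(**lora_labels).set(1)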
+ lora_info = { + self.metrics.labelname_running_lora_adapters: + ",".join(stats.running_lora_adapters), + self.metrics.labelname_waiting_lora_adapters: + ",".join(stats.waiting_lora_adapters), + self.metrics.labelname_max_lora: + stats.max_lora, + } + self._log_gauge_string(self.metrics.gauge_lora_info, lora_info) # Iteration level data self._log_counter(self.metrics.counter_num_preemption, stats.num_preemption_iter) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index bafd5fa1a8a82..e9a5bd3b586be 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -51,6 +51,9 @@ class Stats: num_generation_tokens_requests: List[int] n_requests: List[int] finished_reason_requests: List[str] + waiting_lora_adapters: List[str] + running_lora_adapters: List[str] + max_lora: str spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 16ceddf13511c..5c504e0f0217d 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -59,7 +59,7 @@ def generate( async def beam_search( self, - prompt: Union[PromptType, List[int]], + prompt: Union[str, List[int]], request_id: str, params: BeamSearchParams, ) -> AsyncGenerator[RequestOutput, None]: @@ -71,9 +71,13 @@ async def beam_search( length_penalty = params.length_penalty tokenizer = await self.get_tokenizer(lora_request=None) - tokenizedPrompt = prompt if isinstance( - prompt, list) else tokenizer.encode(prompt) - tokenizedLength = len(tokenizedPrompt) + if isinstance(prompt, str): + tokenized_prompt = tokenizer.encode(prompt) + prompt_text = prompt + else: + tokenized_prompt = prompt + prompt_text = None + tokenized_length = len(tokenized_prompt) sort_beams_key = create_sort_beams_key_function( tokenizer.eos_token_id, length_penalty) @@ -81,7 +85,11 @@ async def beam_search( beam_search_params = SamplingParams(logprobs=2 * beam_width, max_tokens=1, temperature=temperature) - all_beams = [BeamSearchSequence(tokens=tokenizedPrompt, cum_logprob=0)] + all_beams = [ + BeamSearchSequence(tokens=tokenized_prompt, + logprobs=[], + cum_logprob=0) + ] completed = [] for _ in range(max_tokens): @@ -114,6 +122,7 @@ async def beam_search( for token_id, logprob_obj in logprobs.items(): new_beam = BeamSearchSequence( tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs], cum_logprob=current_beam.cum_logprob + logprob_obj.logprob) @@ -131,22 +140,22 @@ async def beam_search( best_beams = sorted_completed[:beam_width] for beam in best_beams: - beam.text = tokenizer.decode(beam.tokens[tokenizedLength:]) + beam.text = tokenizer.decode(beam.tokens[tokenized_length:]) beam_search_output = RequestOutput( request_id=request_id, - prompt=prompt, + prompt=prompt_text, outputs=[ CompletionOutput( text=beam.text, cumulative_logprob=beam.cum_logprob, - token_ids=beam.tokens, + token_ids=beam.tokens[tokenized_length:], index=i, - logprobs=beam.cum_logprob, + logprobs=beam.logprobs, ) for (i, beam) in enumerate(best_beams) ], finished=True, - prompt_token_ids=tokenizedPrompt, + prompt_token_ids=tokenized_prompt, prompt_logprobs=None) yield beam_search_output diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 785dcbfa83119..f64af27a957be 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -5,8 +5,8 @@ from collections import defaultdict from functools import lru_cache, partial from pathlib import Path -from typing import (Any, Awaitable, Dict, Generic, Iterable, 
List, Literal, - Mapping, Optional, Tuple, TypeVar, Union, cast) +from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, + Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) # yapf conflicts with isort for this block # yapf: disable @@ -33,6 +33,7 @@ async_get_and_parse_image, get_and_parse_audio, get_and_parse_image) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -58,10 +59,35 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): """The type of the content part.""" +class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain image_url. + This is supported by OpenAI API, although it is not documented. + + Example: + { + "image_url": "https://example.com/image.jpg" + } + """ + image_url: Required[str] + + +class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain audio_url. + + Example: + { + "audio_url": "https://example.com/audio.mp3" + } + """ + audio_url: Required[str] + + ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam] + CustomChatCompletionContentPartParam, + CustomChatCompletionContentSimpleImageParam, + CustomChatCompletionContentSimpleAudioParam, str] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -386,6 +412,71 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} +# Define a mapping from part types to their corresponding parsing functions. +MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { + "text": + lambda part: _TextParser(part).get("text", ""), + "image_url": + lambda part: _ImageParser(part).get("image_url", {}).get("url", ""), + "audio_url": + lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""), + "refusal": + lambda part: _RefusalParser(part).get("refusal", ""), +} + + +def _parse_chat_message_content_mm_part( + part: ChatCompletionContentPartParam) -> Tuple[str, str]: + """ + Parses a given multi modal content part based on its type. + + Args: + part: A dict containing the content part, with a potential 'type' field. + + Returns: + A tuple (part_type, content) where: + - part_type: Type of the part (e.g., 'text', 'image_url'). + - content: Parsed content (e.g., text, image URL). + + Raises: + ValueError: If the 'type' field is missing and no direct URL is found. + """ + assert isinstance( + part, dict) # This is needed to avoid mypy errors: part.get() from str + part_type = part.get("type", None) + + if isinstance(part_type, str) and part_type in MM_PARSER_MAP: + content = MM_PARSER_MAP[part_type](part) + + # Special case for 'image_url.detail' + if part_type == "image_url" and part.get("detail") != "auto": + logger.warning("'image_url.detail' is currently not supported " + "and will be ignored.") + + return part_type, content + + # Handle missing 'type' but provided direct URL fields. 
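# The two image content-part shapes accepted after this change (URLs are
# placeholders); the plain-URL form corresponds to
# CustomChatCompletionContentSimpleImageParam above.
structured_part = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/image.jpg"},
}
simple_part = {
    # No "type" key: _parse_chat_message_content_mm_part falls back to the
    # direct-URL handling below.
    "image_url": "https://example.com/image.jpg",
}
messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "What is in this image?"},
        simple_part,
    ],
}]
# Both forms resolve to the same ("image_url", url) pair in the parser above.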
+ if part_type is None: + if part.get("image_url") is not None: + image_params = cast(CustomChatCompletionContentSimpleImageParam, + part) + return "image_url", image_params.get("image_url", "") + if part.get("audio_url") is not None: + audio_params = cast(CustomChatCompletionContentSimpleAudioParam, + part) + return "audio_url", audio_params.get("audio_url", "") + + # Raise an error if no 'type' or direct URL is found. + raise ValueError("Missing 'type' field in multimodal part.") + + if not isinstance(part_type, str): + raise ValueError("Invalid 'type' field in multimodal part.") + return part_type, "unknown part_type content" + + +VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", + "audio_url") + def _parse_chat_message_content_parts( role: str, @@ -401,29 +492,28 @@ def _parse_chat_message_content_parts( has_image = False for part in parts: - part_type = part["type"] - if part_type == "text": - text = _TextParser(part)["text"] + if isinstance(part, str): # Handle plain text parts + text = _TextParser(part) texts.append(text) - elif part_type == "image_url": - image_url = _ImageParser(part)["image_url"] - - if image_url.get("detail", "auto") != "auto": - logger.warning( - "'image_url.detail' is currently not supported and " - "will be ignored.") - - mm_parser.parse_image(image_url["url"]) - has_image = True - elif part_type == "audio_url": - audio_url = _AudioParser(part)["audio_url"] - - mm_parser.parse_audio(audio_url["url"]) - elif part_type == "refusal": - text = _RefusalParser(part)["refusal"] - texts.append(text) - else: - raise NotImplementedError(f"Unknown part type: {part_type}") + else: # Handle structured dictionary parts + part_type, content = _parse_chat_message_content_mm_part(part) + + # if part_type is text/refusal/image_url/audio_url but + # content is empty, logg a warning and skip + if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content: + logger.warning("Skipping multimodal part " + "with empty / unparsable content.") + continue + + if part_type in ("text", "refusal"): + texts.append(content) + elif part_type == "image_url": + mm_parser.parse_image(content) + has_image = True + elif part_type == "audio_url": + mm_parser.parse_audio(content) + else: + raise NotImplementedError(f"Unknown part type: {part_type}") text_prompt = "\n".join(texts) if keep_multimodal_content: @@ -564,14 +654,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - logger.warning( + print_warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - logger.warning( + print_warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - logger.warning( + print_warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2010381076c7d..1f7893d54de68 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -8,7 +8,7 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import EngineArgs, TaskOption from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, @@ -29,7 +29,7 @@ get_cached_tokenizer) from vllm.transformers_utils.tokenizer_group import 
TokenizerGroup from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, deprecate_kwargs, is_list_of +from vllm.utils import Counter, deprecate_args, deprecate_kwargs, is_list_of logger = init_logger(__name__) @@ -108,6 +108,12 @@ class LLM: DEPRECATE_LEGACY: ClassVar[bool] = False """A flag to toggle whether to deprecate the legacy generate/encode API.""" + DEPRECATE_INIT_POSARGS: ClassVar[bool] = True + """ + A flag to toggle whether to deprecate positional arguments in + :meth:`LLM.__init__`. + """ + @classmethod @contextmanager def deprecate_legacy_api(cls): @@ -117,6 +123,13 @@ def deprecate_legacy_api(cls): cls.DEPRECATE_LEGACY = False + @deprecate_args( + start_index=2, # Ignore self and model + is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS, + additional_message=( + "All positional arguments other than `model` will be " + "replaced with keyword arguments in an upcoming version."), + ) def __init__( self, model: str, @@ -139,6 +152,8 @@ def __init__( disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, mm_processor_kwargs: Optional[Dict[str, Any]] = None, + # After positional args are removed, move this right below `model` + task: TaskOption = "auto", **kwargs, ) -> None: ''' @@ -153,6 +168,7 @@ def __init__( engine_args = EngineArgs( model=model, + task=task, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, skip_tokenizer_init=skip_tokenizer_init, @@ -316,10 +332,21 @@ def generate( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - if self.llm_engine.model_config.embedding_mode: - raise ValueError( + task = self.llm_engine.model_config.task + if task != "generate": + messages = [ "LLM.generate() is only supported for (conditional) generation " - "models (XForCausalLM, XForConditionalGeneration).") + "models (XForCausalLM, XForConditionalGeneration).", + ] + + supported_tasks = self.llm_engine.model_config.supported_tasks + if "generate" in supported_tasks: + messages.append( + "Your model supports the 'generate' task, but is " + f"currently initialized for the '{task}' task. Please " + "initialize the model using `--task generate`.") + + raise ValueError(" ".join(messages)) if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( @@ -433,6 +460,7 @@ def sort_beams_key(x: BeamSearchSequence) -> float: for token_id, logprob_obj in logprobs.items(): new_beam = BeamSearchSequence( tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs], cum_logprob=current_beam.cum_logprob + logprob_obj.logprob) @@ -691,10 +719,18 @@ def encode( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - if not self.llm_engine.model_config.embedding_mode: - raise ValueError( - "LLM.encode() is only supported for embedding models (XModel)." - ) + task = self.llm_engine.model_config.task + if task != "embedding": + messages = ["LLM.encode() is only supported for embedding models."] + + supported_tasks = self.llm_engine.model_config.supported_tasks + if "embedding" in supported_tasks: + messages.append( + "Your model supports the 'embedding' task, but is " + f"currently initialized for the '{task}' task. 
Please " + "initialize the model using `--task embedding`.") + + raise ValueError(" ".join(messages)) if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( @@ -904,6 +940,3 @@ def _run_engine( def _is_encoder_decoder_model(self): return self.llm_engine.is_encoder_decoder_model() - - def _is_embedding_model(self): - return self.llm_engine.is_embedding_model() diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6f1135f8093ba..06114339b7c69 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -314,9 +314,15 @@ def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: prompt_logprobs = self.top_logprobs guided_json_object = None - if (self.response_format is not None - and self.response_format.type == "json_object"): - guided_json_object = True + if self.response_format is not None: + if self.response_format.type == "json_object": + guided_json_object = True + elif self.response_format.type == "json_schema": + json_schema = self.response_format.json_schema + assert json_schema is not None + self.guided_json = json_schema.json_schema + if self.guided_decoding_backend is None: + self.guided_decoding_backend = "lm-format-enforcer" guided_decoding = GuidedDecodingParams.from_optional( json=self._get_guided_json_from_tool() or self.guided_json, @@ -537,8 +543,8 @@ class CompletionRequest(OpenAIBaseModel): default=None, description= ("Similar to chat completion, this parameter specifies the format of " - "output. Only {'type': 'json_object'} or {'type': 'text' } is " - "supported."), + "output. Only {'type': 'json_object'}, {'type': 'json_schema'} or " + "{'type': 'text' } is supported."), ) guided_json: Optional[Union[str, dict, BaseModel]] = Field( default=None, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1e08cd9712bc0..56e35950410a0 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -258,6 +258,14 @@ async def completion_stream_generator( has_echoed = [False] * num_choices * num_prompts num_prompt_tokens = [0] * num_prompts + stream_options = request.stream_options + if stream_options: + include_usage = stream_options.include_usage + include_continuous_usage = include_usage and \ + stream_options.continuous_usage_stats + else: + include_usage, include_continuous_usage = False, False + try: async for prompt_idx, res in result_generator: prompt_token_ids = res.prompt_token_ids @@ -276,28 +284,25 @@ async def completion_stream_generator( i = output.index + prompt_idx * num_choices assert request.max_tokens is not None - if request.echo and request.max_tokens == 0: + if request.echo and not has_echoed[i]: assert prompt_token_ids is not None assert prompt_text is not None - # only return the prompt - delta_text = prompt_text - delta_token_ids = prompt_token_ids - out_logprobs = prompt_logprobs - has_echoed[i] = True - elif (request.echo and request.max_tokens > 0 - and not has_echoed[i]): - assert prompt_token_ids is not None - assert prompt_text is not None - assert prompt_logprobs is not None - # echo the prompt and first token - delta_text = prompt_text + output.text - delta_token_ids = [ - *prompt_token_ids, *output.token_ids - ] - out_logprobs = [ - *prompt_logprobs, - *(output.logprobs or []), - ] + if request.max_tokens == 0: + # only return the prompt + delta_text = prompt_text + delta_token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + 
else: + assert prompt_logprobs is not None + # echo the prompt and first token + delta_text = prompt_text + output.text + delta_token_ids = [ + *prompt_token_ids, *output.token_ids + ] + out_logprobs = [ + *prompt_logprobs, + *(output.logprobs or []), + ] has_echoed[i] = True else: # return just the delta @@ -341,45 +346,39 @@ async def completion_stream_generator( stop_reason=stop_reason, ) ]) - if (request.stream_options - and request.stream_options.include_usage): - if (request.stream_options.continuous_usage_stats - or output.finish_reason is not None): - prompt_tokens = num_prompt_tokens[prompt_idx] - completion_tokens = previous_num_tokens[i] - usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - if request.stream_options.continuous_usage_stats: - chunk.usage = usage - else: - chunk.usage = None + if include_continuous_usage: + prompt_tokens = num_prompt_tokens[prompt_idx] + completion_tokens = previous_num_tokens[i] + chunk.usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) response_json = chunk.model_dump_json(exclude_unset=False) yield f"data: {response_json}\n\n" - if (request.stream_options - and request.stream_options.include_usage): + total_prompt_tokens = sum(num_prompt_tokens) + total_completion_tokens = sum(previous_num_tokens) + final_usage_info = UsageInfo( + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_prompt_tokens + total_completion_tokens) + + if include_usage: final_usage_chunk = CompletionStreamResponse( id=request_id, created=created_time, model=model_name, choices=[], - usage=usage, + usage=final_usage_info, ) final_usage_data = (final_usage_chunk.model_dump_json( exclude_unset=False, exclude_none=True)) yield f"data: {final_usage_data}\n\n" # report to FastAPI middleware aggregate usage across all choices - total_prompt_tokens = sum(num_prompt_tokens) - total_completion_tokens = sum(previous_num_tokens) - request_metadata.final_usage_info = UsageInfo( - prompt_tokens=total_prompt_tokens, - completion_tokens=total_completion_tokens, - total_tokens=total_prompt_tokens + total_completion_tokens) + request_metadata.final_usage_info = final_usage_info except ValueError as e: # TODO: Use a vllm-specific Validation Error @@ -413,26 +412,26 @@ def request_output_to_completion_response( for output in final_res.outputs: assert request.max_tokens is not None - if request.echo and request.max_tokens == 0: - assert prompt_text is not None - token_ids = prompt_token_ids - out_logprobs = prompt_logprobs - output_text = prompt_text - elif request.echo and request.max_tokens > 0: + if request.echo: assert prompt_text is not None - token_ids = [*prompt_token_ids, *output.token_ids] - - if request.logprobs is None: - out_logprobs = None + if request.max_tokens == 0: + token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + output_text = prompt_text else: - assert prompt_logprobs is not None - assert output.logprobs is not None - out_logprobs = [ - *prompt_logprobs, - *output.logprobs, - ] - - output_text = prompt_text + output.text + token_ids = [*prompt_token_ids, *output.token_ids] + + if request.logprobs is None: + out_logprobs = None + else: + assert prompt_logprobs is not None + assert output.logprobs is not None + out_logprobs = [ + *prompt_logprobs, + *output.logprobs, + ] + + output_text = prompt_text + output.text else: token_ids 
= output.token_ids out_logprobs = output.logprobs diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e9504cfa64b65..6c46aae2838f6 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -83,7 +83,8 @@ def __init__( lora_modules=None, prompt_adapters=None, request_logger=request_logger) - self._enabled = self._check_embedding_mode(model_config.embedding_mode) + self._enabled = self._check_embedding_mode( + model_config.task == "embedding") async def create_embedding( self, diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 309d9bede489b..0e88bb21ca75f 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,10 +1,12 @@ from .abstract_tool_parser import ToolParser, ToolParserManager from .hermes_tool_parser import Hermes2ProToolParser from .internlm2_tool_parser import Internlm2ToolParser +from .jamba_tool_parser import JambaToolParser from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser __all__ = [ "ToolParser", "ToolParserManager", "Hermes2ProToolParser", - "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser" + "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser", + "JambaToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index e7ea82ebd5411..faa6f653b835c 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -53,7 +53,8 @@ def __init__(self, tokenizer: AnyTokenizer): self.tool_call_start_token_id = self.vocab.get( self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) - if not self.tool_call_start_token_id or not self.tool_call_end_token_id: + if (self.tool_call_start_token_id is None + or self.tool_call_end_token_id is None): raise RuntimeError( "Hermes 2 Pro Tool parser could not locate tool call start/end " "tokens in the tokenizer!") diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py new file mode 100644 index 0000000000000..cfd024853f887 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -0,0 +1,300 @@ +import json +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizers import MistralTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +@ToolParserManager.register_module("jamba") +class JambaToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + raise ValueError( + "Detected a MistralTokenizer tokenizer when using a Jamba model" + ) + 
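# Why the tool parsers in this diff (Hermes, Jamba, Mistral) now compare the
# looked-up token ids against None instead of relying on truthiness: a
# perfectly valid token id of 0 is falsy (the vocab below is illustrative).
vocab = {"[TOOL_CALLS]": 0}

bot_token_id = vocab.get("[TOOL_CALLS]")     # 0 -> a real token
missing_id = vocab.get("[NO_SUCH_TOKEN]")    # None -> genuinely missing

assert not bot_token_id          # the old `if not ...` check would reject it
assert bot_token_id is not None  # the new check correctly accepts id 0
assert missing_id is None        # only a truly missing token should fail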
+ self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + + self.tool_calls_start_token: str = "" + self.tool_calls_end_token: str = "" + + self.tool_calls_regex = re.compile( + rf"{self.tool_calls_start_token}(.*?){self.tool_calls_end_token}", + re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction.") + self.tool_calls_start_token_id = self.vocab.get( + self.tool_calls_start_token) + self.tool_calls_end_token_id = self.vocab.get( + self.tool_calls_end_token) + if (self.tool_calls_start_token_id is None + or self.tool_calls_end_token_id is None): + raise RuntimeError( + "Jamba Tool parser could not locate tool calls start/end " + "tokens in the tokenizer!") + + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + if request.tools and request.tool_choice != 'none': + # do not skip special tokens because jamba use the special + # tokens to indicate the start and end of the tool calls + # information. + request.skip_special_tokens = False + return request + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + + # sanity check; avoid unnecessary processing + if self.tool_calls_start_token not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + else: + + try: + # use a regex to find the tool call between the tags + function_calls = self.tool_calls_regex.findall(model_output)[0] + + # load the JSON, and then use it to build the Function and + # Tool Call + raw_function_calls = json.loads(function_calls) + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"]))) + for function_call in raw_function_calls + ] + + content = model_output[:model_output. + find(self.tool_calls_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if + (len(content) > 0 and content != " ") else None) + + except Exception: + logger.exception( + "Error in extracting tool call from response.") + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + # if the tool call token is not in the tokens generated so far, append + # output to contents since it's not a tool + if self.tool_calls_start_token not in current_text: + return DeltaMessage(content=delta_text) + + # if the tool call token ID IS in the tokens generated so far, that + # means we're parsing as tool calls now + + # handle if we detected the start of tool calls token which means + # the start of tool calling + if (self.tool_calls_start_token_id in delta_token_ids + and len(delta_token_ids) == 1): + # if it's the only token, return None, so we don't send a chat + # completion and don't send a control token + return None + + # bit mask flags for partial JSON parsing. 
If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + + # Extract the tool calls between the special tool call tokens + parsable_arr = current_text.split( + self.tool_calls_start_token)[-1].split( + self.tool_calls_end_token)[0] + + # tool calls are generated in an array, so do partial JSON + # parsing on the entire array + try: + tool_call_arr: List[Dict] = partial_json_parser.loads( + parsable_arr, flags) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # select as the current tool call the one we're on the state at + + current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + if len(tool_call_arr) > 0 else {} + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif (len(tool_call_arr) > 0 + and len(tool_call_arr) > self.current_tool_id + 1): + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. + if self.current_tool_id >= 0: + diff: Union[str, None] = current_tool_call.get("arguments") + + if diff: + diff = json.dumps(diff).replace( + self.streamed_args_for_tool[self.current_tool_id], + "") + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # case: update an existing tool - this is handled below + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + if not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + cur_arguments = current_tool_call.get("arguments") + + new_text = delta_text.replace("\'", "\"") + + if not cur_arguments and not prev_arguments: + + delta = None + elif not cur_arguments and prev_arguments: + logger.error( + "INVARIANT - impossible to have arguments reset " + "mid-arguments") + delta = None + elif cur_arguments and not prev_arguments: + cur_arguments_json = json.dumps(cur_arguments) + logger.debug("finding %s in %s", new_text, + cur_arguments_json) + + arguments_delta = cur_arguments_json[:cur_arguments_json. 
+ index(new_text) + + len(new_text)] + logger.debug("First tokens in arguments received: %s", + arguments_delta) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += arguments_delta + + elif cur_arguments and prev_arguments: + cur_args_json = json.dumps(cur_arguments) + prev_args_json = json.dumps(prev_arguments) + logger.debug("Searching for diff between \n%s\n%s", + cur_args_json, prev_args_json) + + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json) + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + else: + # try parsing it with regular JSON - if it works we're + # at the end, and we need to send the difference between + # tokens streamed so far and the valid JSON + delta = None + + # check to see if the name is defined and has been sent. if so, + # stream the name - otherwise keep waiting + # finish by setting old and returning None as base case + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index ff4e88f29d39e..f5c0d92f3f9bd 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -63,7 +63,7 @@ def __init__(self, tokenizer: AnyTokenizer): self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) - if not self.bot_token_id: + if self.bot_token_id is None: raise RuntimeError( "Mistral Tool Parser could not locate the tool call token in " "the tokenizer!") diff --git a/vllm/envs.py b/vllm/envs.py index 2396e87e20c39..385db82d89249 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -32,6 +32,7 @@ VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: bool = False VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False + VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -286,6 +287,11 @@ def get_default_config_root(): "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))), + # If set, vllm will force flashinfer to use tensor cores; + # otherwise will use heuristic based on model architecture. 
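# A condensed sketch of the non-streaming extraction path in JambaToolParser
# above; the special start/end markers are written out here as <tool_calls> /
# </tool_calls> for illustration, and the model output is made up.
import json
import re

TOOL_CALLS_START = "<tool_calls>"
TOOL_CALLS_END = "</tool_calls>"
tool_calls_regex = re.compile(
    rf"{TOOL_CALLS_START}(.*?){TOOL_CALLS_END}", re.DOTALL)

model_output = ("Let me check that for you. "
                '<tool_calls>[{"name": "get_weather", '
                '"arguments": {"city": "Paris"}}]</tool_calls>')

function_calls = tool_calls_regex.findall(model_output)[0]
raw_function_calls = json.loads(function_calls)
tool_calls = [(call["name"], json.dumps(call["arguments"]))
              for call in raw_function_calls]
content = model_output[:model_output.find(TOOL_CALLS_START)]

assert tool_calls == [("get_weather", '{"city": "Paris"}')]
assert content == "Let me check that for you. "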
+ "VLLM_FLASHINFER_FORCE_TENSOR_CORES": + lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))), + # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index cf99306c9caef..8de3385a257f8 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -264,6 +264,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): lambda: nn.ReLU(), "relu2": lambda: ReLUSquaredActivation(), + "silu": + lambda: nn.SiLU(), "quick_gelu": lambda: QuickGELU(), }) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 10fae84dab723..30b43f375dd5c 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -27,7 +27,6 @@ def __init__( self.variance_epsilon = eps self.variance_size_override = (None if var_hidden_size == hidden_size else var_hidden_size) - self.weight = nn.Parameter(torch.ones(hidden_size)) def forward_native( diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 76ccb3dfe0a65..3455a4ccf282f 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -12,6 +12,7 @@ class PoolingType(IntEnum): """Enumeration for different types of pooling methods.""" LAST = 0 ALL = 1 + CLS = 2 class Pooler(nn.Module): @@ -23,12 +24,13 @@ class Pooler(nn.Module): 3. Returns structured results as `PoolerOutput`. Attributes: - pooling_type: The type of pooling to use (LAST, AVERAGE, MAX). + pooling_type: The type of pooling to use (LAST, ALL, CLS). normalize: Whether to normalize the pooled data. 
""" def __init__(self, pooling_type: PoolingType, normalize: bool): super().__init__() + self.pooling_type = pooling_type self.normalize = normalize @@ -38,10 +40,16 @@ def forward( pooling_metadata: PoolingMetadata, ) -> PoolerOutput: """Pools specific information from hidden states based on metadata.""" + prompt_lens = PoolingTensors.from_pooling_metadata( pooling_metadata, hidden_states.device).prompt_lens - if self.pooling_type == PoolingType.LAST: + if self.pooling_type is PoolingType.CLS: + first_token_flat_indices = torch.zeros_like(prompt_lens) + first_token_flat_indices[1:] += torch.cumsum(prompt_lens, + dim=0)[:-1] + pooled_data = hidden_states[first_token_flat_indices] + elif self.pooling_type == PoolingType.LAST: last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 pooled_data = hidden_states[last_token_flat_indices] elif self.pooling_type == PoolingType.ALL: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index a371f1f4ad2cb..ecc345f116c37 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -100,12 +100,21 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": target_scheme_map[target][ "weights"] = QuantizationArgs.parse_obj( quant_config.get("weights")) - try: - target_scheme_map[target][ - "input_activations"] = QuantizationArgs.parse_obj( - quant_config.get("input_activations")) - except Exception: - target_scheme_map[target]["input_activations"] = None + + target_scheme_map[target]["input_activations"] = None + if is_activation_quantization_format(quant_format): + input_activations = quant_config.get("input_activations") + # The only case where we have activation quant supported + # but no input_activations provided in the config + # should be w8a16fp8 w8a16fp8 can also run for cases where + # there is an input_quant but it is ignored + if not input_activations: + assert target_scheme_map[target][ + "weights"].type == QuantizationType.FLOAT + else: + target_scheme_map[target][ + "input_activations"] = QuantizationArgs.parse_obj( + quant_config.get("input_activations")) return cls(target_scheme_map=target_scheme_map, ignore=ignore, @@ -244,8 +253,6 @@ def _get_scheme_from_parts( group_size=weight_quant.group_size, actorder=weight_quant.actorder) - # Detect If Activation Quantization. 
- # TODO @dsikka: clean-up conditions if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( @@ -256,16 +263,19 @@ def _get_scheme_from_parts( is_static_input_scheme=(input_quant and not input_quant.dynamic)) else: + # note: input_quant will be present for converted models; + # will be ignored during inference post loading return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(input_quant - and not input_quant.dynamic)) + is_static_input_scheme=not input_quant.dynamic) + # note: input_quant can be None if self._is_fp8_w8a16(weight_quant, input_quant): + is_static_input_scheme = (input_quant + and not input_quant.dynamic) return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(input_quant - and not input_quant.dynamic)) + is_static_input_scheme=is_static_input_scheme) if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8( diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index 00c82fb77186c..a9f1e6e88d792 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -6,7 +6,6 @@ import torch import torch.nn as nn -import transformers from transformers import PretrainedConfig from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig @@ -108,39 +107,11 @@ def load_weights(self, model_name_or_path: str, **kwargs): neuronx_module = importlib.import_module(neuronx_module_path) neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - split_model_dir = f"{model_name_or_path}-split" - if _is_pretrained_neuron_checkpoint(model_name_or_path): - split_model_dir = model_name_or_path - elif not os.path.exists(f"{model_name_or_path}-split"): - hf_model_cls = getattr(transformers, hf_model_cls_name) - from transformers_neuronx.module import save_pretrained_split - - hf_model = hf_model_cls.from_pretrained(model_name_or_path, - low_cpu_mem_usage=True) - save_pretrained_split(hf_model, f"{model_name_or_path}-split") - - self.model = neuronx_model_cls.from_pretrained(split_model_dir, + self.model = neuronx_model_cls.from_pretrained(model_name_or_path, **kwargs) self.model.to_neuron() -def _is_pretrained_neuron_checkpoint(model_name_or_path: str) -> bool: - # Checking if the neuron checkpoint is saved in the old format. - if os.path.isdir(os.path.join(model_name_or_path, "pytorch_model.bin")): - return True - # Checking if the neuron checkpoint is saved in the new format. 
- pretrained_split_files = ["config.json", "generation_config.json"] - pretrained_split_format = ".safetensors" - for file in pretrained_split_files: - file_path = os.path.join(model_name_or_path, file) - if not os.path.isfile(file_path): - return False - for file in os.listdir(model_name_or_path): - if file.endswith(pretrained_split_format): - return True - return False - - def _get_model_architecture(config: PretrainedConfig) -> str: architectures = getattr(config, "architectures", []) for arch in architectures: diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py new file mode 100644 index 0000000000000..4c0a0e303e655 --- /dev/null +++ b/vllm/model_executor/models/bert.py @@ -0,0 +1,419 @@ +from typing import Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import BertConfig + +from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.attention.backends.xformers import XFormersImpl +from vllm.config import CacheConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.sequence import IntermediateTensors, PoolerOutput + + +class BertEmbedding(nn.Module): + + def __init__(self, config: BertConfig): + + super().__init__() + self.size = config.hidden_size + self.word_embeddings = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.position_embeddings = VocabParallelEmbedding( + config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = VocabParallelEmbedding( + config.type_vocab_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.position_ids = nn.Parameter( + torch.empty((1, config.max_position_embeddings)), ) + + self.position_embedding_type = config.position_embedding_type + if self.position_embedding_type != "absolute": + raise ValueError("Only 'absolute' position_embedding_type" + + " is supported") + + def forward( + self, + input_ids: torch.Tensor, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + input_shape = input_ids.size() + + # Input embeddings. + inputs_embeds = self.word_embeddings(input_ids) + + # Position embeddings. + position_embeddings = self.position_embeddings(position_ids) + + # Token type embeddings. (TODO: move off hotpath?) 
+ token_type_embeddings = self.token_type_embeddings( + torch.zeros(input_shape, + dtype=torch.long, + device=inputs_embeds.device)) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + return embeddings + + +class BertEncoder(nn.Module): + + def __init__(self, + config: BertConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.layer = nn.ModuleList([ + BertLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.layer.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward( + self, + hidden_states: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + for i in range(len(self.layer)): + layer = self.layer[i] + hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, + config: BertConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + + self.attention = BertAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + layer_norm_eps=config.layer_norm_eps, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attention") + + self.intermediate = BertIntermediate( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.intermediate") + + self.output = BertOutput(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + layer_norm_eps=config.layer_norm_eps, + quant_config=quant_config, + prefix=f"{prefix}.output") + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: Optional[torch.Tensor], + attn_metadata: AttentionMetadata, + ): + attn_output = self.attention(hidden_states, kv_cache, attn_metadata) + intermediate_output = self.intermediate(attn_output) + output = self.output(intermediate_output, attn_output) + return output + + +class BertAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + layer_norm_eps: float, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + self.self = BertSelfAttention(hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.output") + + self.output = BertSelfOutput(hidden_size=hidden_size, + layer_norm_eps=layer_norm_eps, + quant_config=quant_config, + prefix=f"{prefix}.output") + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + self_output = self.self(hidden_states, kv_cache, attn_metadata) + return self.output(self_output, hidden_states) + + +class BertSelfAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + + self.total_num_heads = num_attention_heads + assert self.total_num_heads % tp_size == 0 + + self.num_heads = 
self.total_num_heads // tp_size + self.total_num_kv_heads = self.total_num_heads + self.head_dim = self.hidden_size // self.total_num_heads + assert self.head_dim * self.total_num_heads == self.hidden_size + + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.qkv_proj = QKVParallelLinear( + hidden_size=self.hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") + + self.attn = Attention(num_heads=self.num_heads, + head_size=self.head_dim, + scale=self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + if not isinstance(self.attn.impl, XFormersImpl): + raise ValueError( + "Encoder-only models currently require XFORMERS attention " + "backend. Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.") + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + output = self.attn(q, + k, + v, + kv_cache, + attn_metadata, + attn_type=AttentionType.ENCODER_ONLY) + return output + + +class BertSelfOutput(nn.Module): + + def __init__(self, + hidden_size: int, + layer_norm_eps: float, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.dense = RowParallelLinear(input_size=hidden_size, + output_size=hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.dense") + self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.dense(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertIntermediate(nn.Module): + + def __init__(self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.dense = ColumnParallelLinear(input_size=hidden_size, + output_size=intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.dense") + self.intermediate_act_fn = get_act_fn(hidden_act) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, + hidden_size: int, + intermediate_size: int, + layer_norm_eps: float, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + + self.dense = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.dense") + + self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.dense(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertModel(nn.Module): + + def __init__(self, + config: BertConfig, + cache_config: Optional[CacheConfig] = None, + 
quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.embeddings = BertEmbedding(config) + self.encoder = BertEncoder(config, + cache_config, + quant_config, + prefix=f"{prefix}.encoder") + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embeddings(input_ids=input_ids, + position_ids=position_ids) + + return self.encoder(hidden_states, kv_caches, attn_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "query", "q"), + ("qkv_proj", "key", "k"), + ("qkv_proj", "value", "v"), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "pooler" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + +class BertEmbeddingModel(nn.Module): + """A model that uses Bert to provide embedding functionalities. + + This class encapsulates the BertModel and provides an interface for + embedding operations and customized pooling functions. + + Attributes: + model: An instance of BertModel used for forward operations. + _pooler: An instance of Pooler used for pooling operations. 
+ """ + + def __init__( + self, + config: BertConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.model = BertModel(config, cache_config, quant_config) + self._pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return self.model(input_ids=input_ids, + position_ids=positions, + kv_caches=kv_caches, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors, + attn_metadata=attn_metadata) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + self.model.load_weights(weights) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index fd2827c0eff09..a83b7d05df7aa 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -5,7 +5,8 @@ import torch import torch.nn as nn from PIL import Image -from transformers import CLIPVisionConfig, LlavaConfig, SiglipVisionConfig +from transformers import (CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, + SiglipVisionConfig) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig @@ -22,6 +23,10 @@ dummy_seq_data_for_clip, get_max_clip_image_tokens, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP +from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf, + dummy_seq_data_for_pixtral_hf, + get_max_pixtral_hf_image_tokens, + input_processor_for_pixtral_hf) from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_max_siglip_image_tokens, input_processor_for_siglip) @@ -31,8 +36,13 @@ class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: torch.Tensor - """Shape: `(batch_size * num_images, num_channels, height, width)`""" + data: Union[torch.Tensor, List[torch.Tensor]] + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + + Note that `height` or `width` may be different per batch and image, + in which case the data is passed as a list instead of a batched tensor. 
+ """ class LlavaImageEmbeddingInputs(TypedDict): @@ -77,6 +87,8 @@ def get_max_llava_image_tokens(ctx: InputContext): num_image_tokens = get_max_clip_image_tokens(vision_config) elif isinstance(vision_config, SiglipVisionConfig): num_image_tokens = get_max_siglip_image_tokens(vision_config) + elif isinstance(vision_config, PixtralVisionConfig): + num_image_tokens = get_max_pixtral_hf_image_tokens(vision_config) else: msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -120,6 +132,17 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, mm_data = dummy_image_for_siglip(vision_config, num_images) return seq_data, mm_data + elif isinstance(vision_config, PixtralVisionConfig): + seq_data = dummy_seq_data_for_pixtral_hf( + vision_config, + seq_len, + num_images, + image_token_id=hf_config.image_token_index, + image_feature_size_override=image_feature_size, + ) + + mm_data = dummy_image_for_pixtral_hf(vision_config, num_images) + return seq_data, mm_data msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -163,6 +186,15 @@ def input_processor_for_llava(ctx: InputContext, inputs: DecoderOnlyInputs): image_token_id=hf_config.image_token_index, image_feature_size_override=image_feature_size, ) + elif isinstance(vision_config, PixtralVisionConfig): + # We ignore image_feature_size_override since we have non-uniform + # image sizes for Pixtral + return input_processor_for_pixtral_hf( + model_config, + vision_config, + inputs, + image_token_id=hf_config.image_token_index, + ) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -189,6 +221,9 @@ def _init_vision_tower(hf_config: LlavaConfig): vision_config, num_hidden_layers_override=num_hidden_layers, ) + elif isinstance(vision_config, PixtralVisionConfig): + # TODO: allow layer override? + return PixtralHFVisionModel(vision_config) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -210,6 +245,15 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config + # NOTE: These are special cases for Pixtral-12B in the HF-format + # https://huggingface.co/mistral-community/pixtral-12b/blob/main/config.json # noqa + if (config.text_config.architectures is None + and config.text_config.model_type == "mistral"): + config.text_config.architectures = ["MistralForCausalLM"] + if (config.projector_hidden_act is None + and config.vision_config.hidden_act == "gelu"): + config.projector_hidden_act = "gelu" + # TODO: Optionally initializes this for supporting embeddings. self.vision_tower = _init_vision_tower(config) self.multi_modal_projector = LlavaMultiModalProjector( @@ -246,6 +290,7 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) + image_sizes = kwargs.pop("image_sizes", None) image_embeds = kwargs.pop("image_embeds", None) if pixel_values is None and image_embeds is None: @@ -256,6 +301,26 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. 
" f"Got type: {type(pixel_values)}") + # Case for models like PixtralHF that have dynamic image sizes + # so we need to produce a list of tensors + if image_sizes is not None: + images = pixel_values + if isinstance(images, torch.Tensor): + # if passed as batch take all images + NN, N, B, C, W, H = images.shape + images = images.reshape(NN * N * B, C, W, H) + images = [images[i] for i in range(images.size(0))] + elif isinstance(images, list): + # if passed as list flatten lists of tensors + while isinstance(images, list) and len(images) == 1: + images = images[0] + + # TODO: Add validation based on image_sizes + return LlavaImagePixelInputs( + type="pixel_values", + data=images, + ) + return LlavaImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( @@ -286,7 +351,8 @@ def _select_image_features(self, image_features: torch.Tensor, *, def _image_pixels_to_features( self, - vision_tower: Union[CLIPVisionModel, SiglipVisionModel], + vision_tower: Union[CLIPVisionModel, SiglipVisionModel, + PixtralHFVisionModel], pixel_values: torch.Tensor, ) -> torch.Tensor: diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 7f2efb9895f25..9f4f391a6682e 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -22,7 +22,7 @@ QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( composed_weight_loader, default_weight_loader, sharded_weight_loader) from vllm.model_executor.models.interfaces import (HasInnerState, @@ -59,7 +59,7 @@ def __init__(self, config: MambaConfig, layer_idx): self.conv_kernel_size = config.conv_kernel self.intermediate_size = config.intermediate_size self.time_step_rank = int(config.time_step_rank) - + self.is_falcon_mamba = config.model_type == "falcon_mamba" self.conv1d = ColumnParallelLinear( input_size=self.conv_kernel_size, output_size=self.intermediate_size, @@ -109,6 +109,13 @@ def __init__(self, config: MambaConfig, layer_idx): input_is_parallel=True, ) self.activation = config.hidden_act + if self.is_falcon_mamba: + self.dt_layernorm = RMSNorm(self.time_step_rank, + eps=config.mixer_rms_eps) + self.b_layernorm = RMSNorm(self.ssm_state_size, + eps=config.mixer_rms_eps) + self.c_layernorm = RMSNorm(self.ssm_state_size, + eps=config.mixer_rms_eps) def forward(self, hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, @@ -158,8 +165,12 @@ def forward(self, hidden_states: torch.Tensor, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1, ) - - # Note that Jamba normalizes B, C, and time_step here but Mamba doesn't. + # Note that Jamba and FalconMamba normalizes B, C, and time_step here + # but Mamba doesn't. 
+ if self.is_falcon_mamba: + time_step = self.dt_layernorm(time_step.contiguous()) + B = self.b_layernorm(B.contiguous()) + C = self.c_layernorm(C.contiguous()) discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) # 3.c perform the recurrence y ← SSM(A, B, C)(x) @@ -213,11 +224,9 @@ def __init__(self, super().__init__() self.layer_idx = layer_idx self.config = config + self.is_falcon_mamba = config.model_type == "falcon_mamba" self.mixer = MambaMixer(config, layer_idx) - self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - self.pre_ff_layernorm = RMSNorm(config.hidden_size, - eps=config.layer_norm_epsilon) def forward( self, @@ -319,8 +328,18 @@ def __init__( self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - - self.lm_head = self.backbone.embeddings + if config.tie_word_embeddings: + self.lm_head = self.backbone.embeddings + else: + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) # Used to track and store by the Mamba cache between steps. self.mamba_cache: Optional[MambaCacheManager] = None @@ -398,7 +417,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") - # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index f34d21fdef56f..13c5149a63919 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -3,18 +3,25 @@ from itertools import tee from typing import Iterable, List, Mapping, Optional, Tuple, Union +import numpy import torch import torch.nn as nn import torch.nn.functional as F from mistral_common.protocol.instruct.messages import ImageChunk from PIL import Image -from transformers import PretrainedConfig +from transformers import PixtralVisionConfig, PretrainedConfig +from transformers.models.pixtral.image_processing_pixtral import ( + _num_image_tokens) +from transformers.models.pixtral.modeling_pixtral import ( + PixtralRotaryEmbedding, apply_rotary_pos_emb, position_ids_in_meshgrid) from xformers.ops.fmha import memory_efficient_attention from xformers.ops.fmha.attn_bias import BlockDiagonalMask from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.config import CacheConfig, ModelConfig, MultiModalConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, + token_inputs) +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput @@ -25,6 +32,8 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import 
init_vllm_registered_model @@ -576,3 +585,379 @@ def __init__(self, args: VisionEncoderArgs, dim: int): def forward(self, x: torch.Tensor) -> torch.Tensor: return self.w_out(self.gelu(self.w_in(x))) + + +#### HF Transformers version of Pixtral #### +# Based off https://github.com/huggingface/transformers/blob/d7950bff82b18c823193d17d72188c5e46d06c83/src/transformers/models/pixtral/modeling_pixtral.py +# This model follows the Llava family, meaning image embeddings are placed +# instead of the `[IMG]` token placeholders. +# The model uses [`PixtralVisionModel`] for its vision encoder, +# and [`MistralForCausalLM`] for its language decoder. + + +def get_pixtral_hf_patch_grid_length(*, image_size: int, + patch_size: int) -> int: + # Since interpolation is applied, the image size need not be divisible + # assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_pixtral_hf_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_max_pixtral_hf_image_feature_size( + hf_config: PixtralVisionConfig) -> int: + return get_pixtral_hf_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: + return get_max_pixtral_hf_image_feature_size(hf_config) + + +def dummy_seq_data_for_pixtral_hf( + hf_config: PixtralVisionConfig, + seq_len: int, + num_images: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_max_pixtral_hf_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + return SequenceData.from_prompt_token_counts( + (image_token_id, image_feature_size * num_images), + (0, seq_len - image_feature_size * num_images), + ) + + +def dummy_image_for_pixtral_hf( + hf_config: PixtralVisionConfig, + num_images: int, + *, + image_width_override: Optional[int] = None, + image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image if num_images == 1 else [image] * num_images} + + +def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, + image_width: int, + image_height: int) -> Tuple[int, int]: + # Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 + # https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 # noqa: E501 + max_width, max_height = hf_config.image_size, hf_config.image_size + patch_width, patch_height = hf_config.patch_size, hf_config.patch_size + + ratio = max(image_width / max_width, image_height / max_height) + + if ratio > 1: + image_width = int(numpy.ceil(image_width / ratio)) + image_height = int(numpy.ceil(image_height / ratio)) + + num_height_tokens, num_width_tokens = _num_image_tokens( + (image_height, image_width), (patch_height, patch_width)) + + return num_width_tokens, num_height_tokens + + +def input_processor_for_pixtral_hf( + model_config: ModelConfig, + hf_config: PixtralVisionConfig, + inputs: DecoderOnlyInputs, + *, + image_token_id: int, + 
image_feature_size_override: Optional[Union[int, List[int]]] = None, +) -> DecoderOnlyInputs: + assert image_feature_size_override is None, ( + "image_feature_size_override is not supported for Pixtral") + + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + processor = cached_get_processor(model_config.model) + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_data = [image_data] + elif not is_list_of(image_data, Image.Image): + raise TypeError(f"Invalid image type: {type(image_data)}") + + new_prompt = inputs.get("prompt") + new_token_ids = inputs["prompt_token_ids"] + + image_token = processor.image_token + image_break_token = processor.image_break_token + image_end_token = processor.image_end_token + + # Update new_prompt if present + if new_prompt: + parts = new_prompt.split(image_token) + assert len(parts) - 1 == len(image_data) + new_parts = [parts[0]] # Start with the part before any image tokens + + for image, next_part in zip(image_data, parts[1:]): + w, h = image.size + (num_width_tokens, + num_height_tokens) = get_pixtral_hf_image_feature_size( + hf_config, image_width=w, image_height=h) + + replace_tokens = [image_token] * num_width_tokens + [ + image_break_token + ] + replace_tokens = replace_tokens * num_height_tokens + replace_tokens[-1] = image_end_token + + new_parts.append("".join(replace_tokens)) + new_parts.append(next_part) + + new_prompt = "".join(new_parts) + + # Update new_token_ids + convert_tokens_to_ids = processor.tokenizer.convert_tokens_to_ids + image_token_id = convert_tokens_to_ids(image_token) + image_break_id = convert_tokens_to_ids(image_break_token) + image_end_id = convert_tokens_to_ids(image_end_token) + placeholder_token_id = -999 + # Find all image token indices at once + placeholder_indices = [ + idx for idx, token_id in enumerate(new_token_ids) + if token_id == image_token_id + ] + assert len(placeholder_indices) == len(image_data) + replace_tokens_list = [] + for placeholder_idx, image in zip(placeholder_indices, image_data): + new_token_ids[placeholder_idx] = placeholder_token_id + + w, h = image.size + (num_width_tokens, + num_height_tokens) = get_pixtral_hf_image_feature_size(hf_config, + image_width=w, + image_height=h) + + replace_tokens = [image_token_id] * num_width_tokens + [image_break_id] + replace_tokens = replace_tokens * num_height_tokens + replace_tokens[-1] = image_end_id + replace_tokens_list.append(replace_tokens) + + # Backward iteration for replacement without affecting known indices + for placeholder_idx, replace_tokens in zip(reversed(placeholder_indices), + reversed(replace_tokens_list)): + new_token_ids[placeholder_idx:placeholder_idx + 1] = replace_tokens + + # NOTE: Create a defensive copy of the original inputs + return token_inputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +class PixtralHFMLP(nn.Module): + + def __init__(self, config: PixtralVisionConfig): + super().__init__() + assert config.intermediate_size is not None + self.gate_proj = nn.Linear(config.hidden_size, + config.intermediate_size, + bias=False) + self.up_proj = nn.Linear(config.hidden_size, + config.intermediate_size, + bias=False) + self.down_proj = nn.Linear(config.intermediate_size, + config.hidden_size, + bias=False) + self.act = get_act_fn(config.hidden_act) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(self.act(self.gate_proj(x)) * 
self.up_proj(x)) + + +class PixtralHFAttention(nn.Module): + + def __init__(self, config: PixtralVisionConfig): + super().__init__() + self.config = config + assert not config.hidden_size % config.num_attention_heads + self.n_heads = config.num_attention_heads + self.head_dim = config.hidden_size // config.num_attention_heads + + self.scale = self.head_dim**-0.5 + + self.q_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + self.k_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + self.v_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + self.o_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: BlockDiagonalMask, + position_embeddings: torch.Tensor, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + batch, patches, _ = hidden_states.size() + + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + # Transpose q and k to apply HF's Rotary Position Embedding + q = q.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + k = k.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + cos, sin = position_embeddings + q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0) + + # Transpose q and k back for attention + q = q.transpose(1, 2).contiguous() + k = k.transpose(1, 2).contiguous() + v = v.reshape(batch, patches, self.n_heads, self.head_dim) + + out = memory_efficient_attention(q, k, v, attn_bias=attention_mask) + out = out.reshape(batch, patches, self.n_heads * self.head_dim) + + return self.o_proj(out) + + +class PixtralHFTransformerBlock(nn.Module): + + def __init__(self, config: PixtralVisionConfig): + super().__init__() + self.attention_norm = RMSNorm(config.hidden_size, eps=1e-5) + self.attention = PixtralHFAttention(config) + self.feed_forward = PixtralHFMLP(config) + self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: BlockDiagonalMask, + position_embeddings: torch.Tensor, + ) -> torch.Tensor: + r = self.attention.forward(self.attention_norm(hidden_states), + attention_mask=attention_mask, + position_embeddings=position_embeddings) + h = hidden_states + r + r = self.feed_forward.forward(self.ffn_norm(h)) + out = h + r + return out + + +class PixtralHFTransformer(nn.Module): + + def __init__(self, config: PixtralVisionConfig): + super().__init__() + self.layers = torch.nn.ModuleList() + for _ in range(config.num_hidden_layers): + self.layers.append(PixtralHFTransformerBlock(config)) + + def forward( + self, + x: torch.Tensor, + attention_mask: BlockDiagonalMask, + position_embeddings: torch.Tensor, + ) -> torch.Tensor: + for layer in self.layers: + x = layer(x, attention_mask, position_embeddings) + return x + + +class PixtralHFVisionModel(nn.Module): + + def __init__(self, config: PixtralVisionConfig): + super().__init__() + + self.config = config + self.patch_conv = nn.Conv2d( + in_channels=config.num_channels, + out_channels=config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size, + bias=False, + ) + self.ln_pre = RMSNorm(config.hidden_size, eps=1e-5) + self.transformer = PixtralHFTransformer(config) + self.dtype = next(self.parameters()).dtype + self.device = next(self.parameters()).device + self.patch_positional_embedding = PixtralRotaryEmbedding( + config, self.device) + + def forward( + self, + pixel_values: List[torch.Tensor], + ) -> 
torch.Tensor: + """ + Args: + pixel_values: tensor of token features for + all tokens of all images of shape (N_toks, D) + Returns: + image_features: tensor of token features for + all tokens of all images of shape (N_toks, D) + """ + # pass images through initial convolution independently + patch_embeds_list = [ + self.patch_conv( + img.reshape(-1, img.shape[-3], img.shape[-2], + img.shape[-1]).to(self.dtype)) + for img in pixel_values + ] + + # flatten to a single sequence + patch_embeds = torch.cat( + [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) + patch_embeds = self.ln_pre(patch_embeds) + + # positional embeddings + position_ids = position_ids_in_meshgrid( + patch_embeds_list, + max_width=self.config.image_size // self.config.patch_size).to( + self.device) + + position_embedding = self.patch_positional_embedding( + patch_embeds, position_ids) + attention_mask = BlockDiagonalMask.from_seqlens( + [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], ) + out = self.transformer(patch_embeds, attention_mask, + position_embedding) + + return out + + # (TODO) Add prefix argument for filtering out weights to be loaded + # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index eb9a9aa9364cc..cb04cc4850951 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -364,6 +364,14 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ] embedding_modules = {} embedding_padding_modules = [] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } def __init__( self, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f7d632a83cc33..a3540abdc23d3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,7 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
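# --- Illustrative sketch (not part of the diff): the placeholder expansion done
# by input_processor_for_pixtral_hf above for a single image. The 64x48 image and
# 16-pixel patches are example values, the token strings stand in for the
# processor's image_token / image_break_token / image_end_token, and the
# resize/ceiling handling inside _num_image_tokens is ignored for brevity.
patch_size = 16
image_width, image_height = 64, 48
num_width_tokens = image_width // patch_size    # 4 patches per row
num_height_tokens = image_height // patch_size  # 3 patch rows

replace_tokens = ["[IMG]"] * num_width_tokens + ["[IMG_BREAK]"]
replace_tokens = replace_tokens * num_height_tokens
replace_tokens[-1] = "[IMG_END]"
# 15 tokens: each patch row becomes four [IMG] placeholders plus an [IMG_BREAK],
# and the trailing break of the last row is swapped for [IMG_END]. This is the
# sequence the processor splices in place of each original image token.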
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -from functools import lru_cache, partial +from functools import partial from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, Tuple, Type, TypedDict, Union) @@ -63,7 +63,7 @@ from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import get_processor +from vllm.transformers_utils.processor import cached_get_processor from .interfaces import SupportsMultiModal, SupportsPP from .utils import (PPMissingLayer, get_vit_attn_backend, @@ -544,8 +544,6 @@ def forward( # === Vision input helpers === # -cached_get_processor = lru_cache(get_processor) - def mm_input_mapper_for_qwen2_vl( ctx: InputContext, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 03a67e3712d72..2a04ece24c8bd 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -53,6 +53,7 @@ # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), + "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), @@ -87,6 +88,7 @@ _EMBEDDING_MODELS = { # [Text-only] + "BertModel": ("bert", "BertEmbeddingModel"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "MistralModel": ("llama", "LlamaEmbeddingModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), diff --git a/vllm/outputs.py b/vllm/outputs.py index 15cb8d53186df..07650241cb638 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -4,7 +4,6 @@ from typing import Sequence as GenericSequence from typing import Union -from vllm.inputs import PromptType from vllm.lora.request import LoRARequest from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, @@ -93,7 +92,7 @@ class RequestOutput: def __init__( self, request_id: str, - prompt: Optional[PromptType], + prompt: Optional[str], prompt_token_ids: Optional[List[int]], prompt_logprobs: Optional[PromptLogprobs], outputs: List[CompletionOutput], diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 4f2ae75e65f3a..9993cec13d649 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -49,14 +49,17 @@ class GuidedDecodingParams: @staticmethod def from_optional( - json: Optional[Union[Dict, BaseModel, str]], + json: Optional[Union[Dict, BaseModel, str]] = None, regex: Optional[str] = None, choice: Optional[List[str]] = None, grammar: Optional[str] = None, json_object: Optional[bool] = None, backend: Optional[str] = None, whitespace_pattern: Optional[str] = None, - ) -> "GuidedDecodingParams": + ) -> Optional["GuidedDecodingParams"]: + if all(arg is None + for arg in (json, regex, choice, grammar, json_object)): + return None # Extract json schemas from pydantic models if isinstance(json, (BaseModel, type(BaseModel))): json = json.model_json_schema() diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index aaf6ec5f508c8..3aa999fcb9ebb 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -179,7 +179,7 @@ def supports_gpu_multi_step(self, execute_model_req: 
ExecuteModelRequest): return False # TODO: Add support for other attn backends - if self.attn_backend.get_name() != "flash-attn": + if self.attn_backend.get_name() != "FLASH_ATTN": return False # TODO: Add support for LORA diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 50d2767a03752..316db43502d3b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -184,7 +184,7 @@ def create_worker( if not disable_mqa_scorer: if scorer_worker.model_runner.attn_backend.get_name( - ) != "flash-attn": + ) != "FLASH_ATTN": disable_mqa_scorer = True logger.info( "[Speculative Decoding] Disabling MQA scorer as the " diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 98663f7f0bd07..f1523667b0466 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,3 +1,4 @@ +from functools import lru_cache from typing import Any, cast @@ -37,6 +38,9 @@ def get_processor( return cast(ProcessorMixin, processor) +cached_get_processor = lru_cache(get_processor) + + def get_image_processor( processor_name: str, *args: Any, diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index dcb5cf216c996..23ea657ffb0a9 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -2,11 +2,12 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast +import huggingface_hub from huggingface_hub import HfApi, hf_hub_download +from mistral_common.protocol.instruct.request import ChatCompletionRequest # yapf: disable -from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest from mistral_common.tokens.tokenizers.mistral import ( MistralTokenizer as PublicMistralTokenizer) # yapf: enable @@ -24,6 +25,26 @@ class Encoding: input_ids: List[int] +def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: + repo_cache = os.path.join( + huggingface_hub.constants.HF_HUB_CACHE, + huggingface_hub.constants.REPO_ID_SEPARATOR.join( + ["models", *repo_id.split("/")])) + + if revision is None: + revision_file = os.path.join(repo_cache, "refs", "main") + if os.path.isfile(revision_file): + with open(revision_file) as file: + revision = file.read() + + if revision: + revision_dir = os.path.join(repo_cache, "snapshots", revision) + if os.path.isdir(revision_dir): + return os.listdir(revision_dir) + + return [] + + def find_tokenizer_file(files: List[str]): file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$") @@ -90,9 +111,16 @@ def from_pretrained(cls, @staticmethod def _download_mistral_tokenizer_from_hf(tokenizer_name: str, revision: Optional[str]) -> str: - api = HfApi() - repo_info = api.model_info(tokenizer_name) - files = [s.rfilename for s in repo_info.siblings] + try: + hf_api = HfApi() + files = hf_api.list_repo_files(repo_id=tokenizer_name, + revision=revision) + except ConnectionError as exc: + files = list_local_repo_files(repo_id=tokenizer_name, + revision=revision) + + if len(files) == 0: + raise exc filename = find_tokenizer_file(files) @@ -166,7 +194,7 @@ def apply_chat_template(self, tools: Optional[Dict[str, Any]] = None, **kwargs) -> List[int]: - last_message = messages[-1] + last_message = cast(Dict[str, Any], messages[-1]) if last_message["role"] 
== "assistant": last_message["prefix"] = True diff --git a/vllm/utils.py b/vllm/utils.py index 07769da3c86d4..0147d595fec70 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1034,10 +1034,54 @@ def identity(value: T) -> T: F = TypeVar('F', bound=Callable[..., Any]) +def deprecate_args( + start_index: int, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None, +) -> Callable[[F], F]: + + if not callable(is_deprecated): + is_deprecated = partial(identity, is_deprecated) + + def wrapper(fn: F) -> F: + + params = inspect.signature(fn).parameters + pos_types = ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ) + pos_kws = [ + kw for kw, param in params.items() if param.kind in pos_types + ] + + @wraps(fn) + def inner(*args, **kwargs): + if is_deprecated(): + deprecated_args = pos_kws[start_index:len(args)] + if deprecated_args: + msg = ( + f"The positional arguments {deprecated_args} are " + "deprecated and will be removed in a future update.") + if additional_message is not None: + msg += f" {additional_message}" + + warnings.warn( + DeprecationWarning(msg), + stacklevel=3, # The inner function takes up one level + ) + + return fn(*args, **kwargs) + + return inner # type: ignore + + return wrapper + + def deprecate_kwargs( - *kws: str, - is_deprecated: Union[bool, Callable[[], bool]] = True, - additional_message: Optional[str] = None) -> Callable[[F], F]: + *kws: str, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None, +) -> Callable[[F], F]: deprecated_kws = set(kws) if not callable(is_deprecated): diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 090f95e6e892c..ac3270d1c9909 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -53,7 +53,6 @@ def __init__( # Get attention backend. self.attn_backend = get_attn_backend(self.head_size, - model_config.get_sliding_window(), model_config.dtype, cache_config.cache_dtype, self.block_size, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index dd38b550eb011..5032896600b3b 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -420,7 +420,6 @@ def __init__( self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index b84562851f0f8..ab93471b5af74 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -57,7 +57,6 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig, # Get attention backend. 
self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, cache_config.cache_dtype, self.block_size, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a82956985af55..f98fb7e4f01df 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1011,7 +1011,6 @@ def __init__( self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, @@ -1856,7 +1855,7 @@ def forward( self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) self.input_buffers["positions"].copy_(positions, non_blocking=True) - if self.backend_name != "placeholder-attn": + if self.backend_name != "NO_ATTENTION": self.input_buffers["slot_mapping"].copy_( attn_metadata.slot_mapping, non_blocking=True) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 0cd0047bebf2d..be2f0d79154d6 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -29,8 +29,8 @@ logger = init_logger(__name__) -MULTI_STEP_ATTENTION_BACKENDS = ["flash-attn", "rocm-flash-attn", "flashinfer"] -MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["flash-attn"] +MULTI_STEP_ATTENTION_BACKENDS = ["FLASH_ATTN", "ROCM_FLASH", "FLASHINFER"] +MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"] def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ -> List[str]: diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 760b18427e22b..a164fbe3393c4 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -75,7 +75,6 @@ def __init__( self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 24425fece850f..bc245d19663d6 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -71,7 +71,6 @@ def __init__( # Get attention backend. 
self.attn_backend = get_attn_backend( self.head_size, - self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index f7e5f660c0249..87ced7818a676 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -114,7 +114,6 @@ def __init__( dtype=np.int32) self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index ab61e4377f900..fd30962e5d6bb 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -92,7 +92,7 @@ def __init__( ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner if model_runner_cls is not None: ModelRunnerClass = model_runner_cls - elif self._is_embedding_model(): + elif model_config.task == "embedding": ModelRunnerClass = EmbeddingModelRunner elif self._is_encoder_decoder_model(): ModelRunnerClass = EncoderDecoderModelRunner @@ -147,9 +147,6 @@ def stop_profile(self): def _is_encoder_decoder_model(self): return self.model_config.is_encoder_decoder_model - def _is_embedding_model(self): - return self.model_config.is_embedding_model - def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -217,42 +214,79 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. self.model_runner.profile_run() + torch.cuda.synchronize() + + self._assert_memory_footprint_increased_during_profiling() + + # Get the peak memory allocation recorded by torch + peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] + + # Check for any memory left around that may have been allocated on the + # gpu outside of `torch`. NCCL operations, for example, can use a few + # GB during a forward pass + torch.cuda.empty_cache() + torch_allocated_bytes = torch.cuda.memory_stats( + )["allocated_bytes.all.current"] + total_allocated_bytes = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + non_torch_allocations = total_allocated_bytes - torch_allocated_bytes + if non_torch_allocations > 0: + peak_memory += non_torch_allocations + + available_kv_cache_memory = ( + total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) # Calculate the number of blocks that can be allocated with the # profiled peak memory. - torch.cuda.synchronize() - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( - "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. 
This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - cache_block_size = self.get_cache_block_size_bytes() if cache_block_size == 0: num_gpu_blocks = 0 num_cpu_blocks = 0 else: - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) + num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) + + logger.info( + "Memory profiling results: total_gpu_memory=%.2fGiB" + " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB" + " memory_usage_post_profile=%.2fGib" + " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB" + " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3), + (total_gpu_memory - free_memory_pre_profile) / (1024**3), + (peak_memory - non_torch_allocations) / (1024**3), + total_allocated_bytes / (1024**3), + non_torch_allocations / (1024**3), + available_kv_cache_memory / (1024**3), + self.cache_config.gpu_memory_utilization) + + # Final cleanup if self.model_runner.lora_manager: self.model_runner.remove_all_loras() gc.collect() - torch.cuda.empty_cache() + return num_gpu_blocks, num_cpu_blocks + def _assert_memory_footprint_increased_during_profiling(self): + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + free_gpu_memory, _ = torch.cuda.mem_get_info() + assert self.init_gpu_memory - free_gpu_memory > 0, ( + "Error in memory profiling. " + f"Initial free memory {self.init_gpu_memory}, current free memory" + f" {free_gpu_memory}. This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Allocate GPU and CPU KV cache with the specified number of blocks. diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 5ff4626c060b3..75a6de3b24ba4 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -374,7 +374,6 @@ def __init__( self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size,
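# --- Illustrative sketch (not part of the diff): the KV-cache sizing arithmetic
# now used by Worker.determine_num_available_blocks, with made-up numbers for a
# 24 GiB GPU. peak_memory stands for the profiled peak including any non-torch
# allocations, and cache_block_size is a hypothetical per-block byte cost.
GiB = 1024**3
total_gpu_memory = 24 * GiB
gpu_memory_utilization = 0.9
peak_memory = 17 * GiB
cache_block_size = 2 * 1024 * 1024  # 2 MiB per KV-cache block (example value)

available_kv_cache_memory = (total_gpu_memory * gpu_memory_utilization -
                             peak_memory)
num_gpu_blocks = max(int(available_kv_cache_memory // cache_block_size), 0)
# num_gpu_blocks == 2355 for these numbers: the head-room left under the
# gpu_memory_utilization cap after the profiled peak is carved into KV-cache
# blocks.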