From 487678d046fe56560ff5dc6c91c3f3c31af7de6f Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Oct 2024 10:14:27 +0800 Subject: [PATCH 01/12] [Bugfix][Hardware][CPU] Fix CPU model input for decode (#9044) --- vllm/worker/cpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index cebb0f36a2b28..534d167d994fe 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -133,7 +133,7 @@ def build(self) -> ModelInputForCPU: (input_tokens, input_positions, attn_metadata) = self._prepare_decode( self.seq_group_metadata_list) - seq_lens = [] + seq_lens = None return self.model_input_cls( input_tokens=input_tokens, From c8f26bb63694adb4202ab275efb0759c13edcaa8 Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Sun, 6 Oct 2024 20:52:42 -0700 Subject: [PATCH 02/12] [BugFix][Core] Fix BlockManagerV2 when Encoder Input is None (#9103) --- vllm/core/block/block_table.py | 2 -- vllm/core/block_manager_v2.py | 4 +++- vllm/engine/arg_utils.py | 5 ----- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a9f4bd871dfda..d10cb29ef4a7c 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -220,7 +220,6 @@ def free(self) -> None: occupied by each block. After freeing all the blocks, the `_blocks` list is set to `None`. """ - assert self._is_allocated for block in self.blocks: self._allocator.free(block) self._blocks.reset() @@ -239,7 +238,6 @@ def physical_block_ids(self) -> List[int]: List[int]: A list of physical block indices for the blocks in the BlockTable. """ - assert self._is_allocated return self._blocks.ids() def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0fad5fa99daf8..c7ee6609306d7 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -151,7 +151,9 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) - block_table.allocate(seq.get_token_ids()) + if seq.get_token_ids(): + # Add blocks to the block table only if the sequence is non empty. + block_table.allocate(seq.get_token_ids()) return block_table diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1623ebb3aa74c..cae95d20ca23d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -903,11 +903,6 @@ def create_engine_config(self) -> EngineConfig: "--enable-prefix-caching is currently not " "supported for multimodal models and has been disabled.") self.enable_prefix_caching = False - if model_config.is_encoder_decoder_model: - logger.warning( - "Block Manager v2 does not support encoder-decoder models" - " currently. 
Using Block Manager v1 as fallback.") - self.use_v2_block_manager = False cache_config = CacheConfig( block_size=self.block_size if self.device != "neuron" else From 18b296fdb2248e8a65bf005e7193ebd523b875b6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 6 Oct 2024 22:47:04 -0700 Subject: [PATCH 03/12] [core] remove beam search from the core (#9105) --- benchmarks/backend_request_func.py | 6 - benchmarks/benchmark_latency.py | 3 +- benchmarks/benchmark_prioritization.py | 24 ++- benchmarks/benchmark_serving.py | 7 - benchmarks/benchmark_throughput.py | 29 ++-- examples/llm_engine_example.py | 3 - examples/multilora_inference.py | 18 --- tests/basic_correctness/test_preemption.py | 114 +------------- tests/conftest.py | 14 -- tests/core/block/e2e/test_correctness.py | 67 -------- tests/core/utils.py | 7 +- tests/samplers/test_beam_search.py | 4 +- tests/samplers/test_sampler.py | 30 +--- vllm/core/scheduler.py | 4 +- vllm/engine/async_llm_engine.py | 16 +- vllm/engine/output_processor/single_step.py | 164 +------------------- vllm/entrypoints/llm.py | 13 +- vllm/entrypoints/openai/protocol.py | 10 +- vllm/envs.py | 5 - vllm/model_executor/layers/sampler.py | 9 +- vllm/outputs.py | 6 +- vllm/sampling_params.py | 73 +-------- vllm/sequence.py | 46 ++---- vllm/utils.py | 19 +++ vllm/worker/tpu_model_runner.py | 3 - 25 files changed, 98 insertions(+), 596 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index bcd38461617a8..4813fde27f0bc 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -23,7 +23,6 @@ class RequestFuncInput: output_len: int model: str best_of: int = 1 - use_beam_search: bool = False logprobs: Optional[int] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False @@ -49,7 +48,6 @@ async def async_request_tgi( assert api_url.endswith("generate_stream") async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -121,7 +119,6 @@ async def async_request_trt_llm( assert api_url.endswith("generate_stream") async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -187,7 +184,6 @@ async def async_request_deepspeed_mii( ) -> RequestFuncOutput: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 - assert not request_func_input.use_beam_search payload = { "prompt": request_func_input.prompt, @@ -235,7 +231,6 @@ async def async_request_openai_completions( ), "OpenAI Completions API URL must end with 'completions' or 'profile'." async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search payload = { "model": request_func_input.model, "prompt": request_func_input.prompt, @@ -317,7 +312,6 @@ async def async_request_openai_chat_completions( ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert not request_func_input.use_beam_search content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index eadf994cacd34..938d7acd5687c 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -51,9 +51,8 @@ def main(args: argparse.Namespace): sampling_params = SamplingParams( n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=args.use_beam_search, ignore_eos=True, max_tokens=args.output_len, ) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 0ba29fabca59b..8843e3a927a01 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -68,7 +68,6 @@ def run_vllm( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -114,9 +113,8 @@ def run_vllm( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) @@ -144,15 +142,16 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm( - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.gpu_memory_utilization, - args.download_dir) + elapsed_time = run_vllm(requests, args.model, args.tokenizer, + args.quantization, args.tensor_parallel_size, + args.seed, args.n, args.trust_remote_code, + args.dtype, args.max_model_len, + args.enforce_eager, args.kv_cache_dtype, + args.quantization_param_path, args.device, + args.enable_prefix_caching, + args.enable_chunked_prefill, + args.max_num_batched_tokens, + args.gpu_memory_utilization, args.download_dir) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -203,7 +202,6 @@ def main(args: argparse.Namespace): type=int, default=1, help="Number of generated sequences per prompt.") - parser.add_argument("--use-beam-search", action="store_true") parser.add_argument("--num-prompts", type=int, default=200, diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 0460f4c0094be..292d1f37fbf3e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -391,7 +391,6 @@ async def benchmark( input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], best_of: int, - use_beam_search: bool, request_rate: float, disable_tqdm: bool, profile: bool, @@ -419,7 +418,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) @@ -441,7 +439,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=test_mm_content, ) profile_output = await request_func(request_func_input=profile_input) @@ 
-464,7 +461,6 @@ async def benchmark( output_len=output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, multi_modal_content=mm_content, ) tasks.append( @@ -483,7 +479,6 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, best_of=best_of, - use_beam_search=use_beam_search, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -679,7 +674,6 @@ def main(args: argparse.Namespace): input_requests=input_requests, logprobs=args.logprobs, best_of=args.best_of, - use_beam_search=args.use_beam_search, request_rate=args.request_rate, disable_tqdm=args.disable_tqdm, profile=args.profile, @@ -701,7 +695,6 @@ def main(args: argparse.Namespace): result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id result_json["best_of"] = args.best_of - result_json["use_beam_search"] = args.use_beam_search result_json["num_prompts"] = args.num_prompts # Metadata diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c6bc607ff6b8e..3781863f77e64 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -73,7 +73,6 @@ def run_vllm( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -91,7 +90,6 @@ def run_vllm( download_dir: Optional[str] = None, load_format: str = EngineArgs.load_format, disable_async_output_proc: bool = False, - use_new_beam_search_impl: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -127,19 +125,19 @@ def run_vllm( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) - if not use_new_beam_search_impl: + use_beam_search = False + + if not use_beam_search: start = time.perf_counter() llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - assert use_beam_search prompts = [prompt for prompt, _, _ in requests] # output_len should be the same for all requests. 
output_len = requests[0][2] @@ -165,7 +163,6 @@ async def run_vllm_async( tensor_parallel_size: int, seed: int, n: int, - use_beam_search: bool, trust_remote_code: bool, dtype: str, max_model_len: Optional[int], @@ -224,9 +221,8 @@ async def run_vllm_async( sampling_params.append( SamplingParams( n=n, - temperature=0.0 if use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=use_beam_search, ignore_eos=True, max_tokens=output_len, )) @@ -248,11 +244,9 @@ def run_hf( model: str, tokenizer: PreTrainedTokenizerBase, n: int, - use_beam_search: bool, max_batch_size: int, trust_remote_code: bool, ) -> float: - assert not use_beam_search llm = AutoModelForCausalLM.from_pretrained( model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) if llm.config.model_type == "llama": @@ -284,7 +278,7 @@ def run_hf( padding=True).input_ids llm_outputs = llm.generate( input_ids=input_ids.cuda(), - do_sample=not use_beam_search, + do_sample=True, num_return_sequences=n, temperature=1.0, top_p=1.0, @@ -340,7 +334,7 @@ def main(args: argparse.Namespace): if args.backend == "vllm": run_args = [ requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, + args.tensor_parallel_size, args.seed, args.n, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.quantization_param_path, args.device, @@ -355,12 +349,11 @@ def main(args: argparse.Namespace): run_args.append(args.disable_frontend_multiprocessing) elapsed_time = uvloop.run(run_vllm_async(*run_args)) else: - elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl) + elapsed_time = run_vllm(*run_args) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.use_beam_search, args.hf_max_batch_size, - args.trust_remote_code) + args.hf_max_batch_size, args.trust_remote_code) elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) @@ -414,8 +407,6 @@ def main(args: argparse.Namespace): type=int, default=1, help="Number of generated sequences per prompt.") - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--use-new-beam-search-impl", action="store_true") parser.add_argument("--num-prompts", type=int, default=1000, @@ -570,8 +561,6 @@ def main(args: argparse.Namespace): raise ValueError("dtype must be auto for MII backend.") if args.n != 1: raise ValueError("n must be 1 for MII backend.") - if args.use_beam_search: - raise ValueError("Beam search is not supported for MII backend.") if args.quantization is not None: raise ValueError("Quantization is only for vLLM backend.") if args.hf_max_batch_size is not None: diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index ca41f32b12b31..60d894aae9692 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -18,9 +18,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: temperature=0.8, top_p=0.95, frequency_penalty=0.1)), - ("It is only with the heart that one can see rightly", - SamplingParams(n=3, best_of=3, use_beam_search=True, - temperature=0.0)), ] diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 6aa25b4689ec8..043220d979c3c 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -43,15 +43,6 @@ def create_test_prompts( max_tokens=128, 
stop_token_ids=[32003]), LoRARequest("sql-lora", 1, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, @@ -60,15 +51,6 @@ def create_test_prompts( max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora2", 2, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), ] diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 05e7859759002..4e502cfb5f4f8 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -23,11 +23,9 @@ @pytest.fixture(scope="module", autouse=True) def check_settings(): assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1, " - "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1. " + "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." 
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 pytest " - "tests/basic_correctness/test_preemption.py`") + "pytest tests/basic_correctness/test_preemption.py`") @pytest.fixture @@ -137,114 +135,6 @@ def test_preemption( assert total_preemption == total_recorded_preemption -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("beam_width", [4]) -def test_swap( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - beam_width: int, - worker_use_ray: bool, -) -> None: - """Use beam search enables swapping.""" - example_prompts = example_prompts[:1] - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - - with vllm_runner( - model, - dtype=dtype, - swap_space=10, - disable_log_stats=False, - worker_use_ray=worker_use_ray, - ) as vllm_model: - vllm_outputs = vllm_model.generate_beam_search(example_prompts, - beam_width, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) - - for i in range(len(example_prompts)): - hf_output_ids, _ = hf_outputs[i] - vllm_output_ids, _ = vllm_outputs[i] - assert len(hf_output_ids) == len(vllm_output_ids) - for j in range(len(hf_output_ids)): - assert hf_output_ids[j] == vllm_output_ids[j], ( - f"Test{i} output{j}:\nHF: {hf_output_ids}\n" - f"vLLM: {vllm_output_ids}") - - assert ("is preempted by PreemptionMode.SWAP mode because there " - "is not enough KV cache space." in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("beam_width", [4]) -@pytest.mark.parametrize("use_v2_block_manager", [True, False]) -def test_swap_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - beam_width: int, - worker_use_ray: bool, - use_v2_block_manager: bool, -) -> None: - """Verify infeasible swap request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - example_prompts = example_prompts[:1] - with vllm_runner( - model, - dtype=dtype, - swap_space=10, - block_size=BLOCK_SIZE, - # Since beam search have more than 1 sequence, prefill + - # decode blocks are not enough to finish. 
- num_gpu_blocks_override=prefill_blocks + decode_blocks, - max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, - worker_use_ray=worker_use_ray, - use_v2_block_manager=use_v2_block_manager, - ) as vllm_model: - sampling_params = SamplingParams(n=beam_width, - use_beam_search=True, - temperature=0.0, - max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.model.generate( - example_prompts, - sampling_params=sampling_params, - ) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - assert req_outputs[0].outputs[0].finish_reason == "length" - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) diff --git a/tests/conftest.py b/tests/conftest.py index 5de3f1f2a2b90..713be09ca96ea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -782,7 +782,6 @@ def generate_encoder_decoder_greedy_logprobs( List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, - use_beam_search=False, max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=(num_prompt_logprobs), @@ -795,19 +794,6 @@ def generate_encoder_decoder_greedy_logprobs( encoder_decoder_prompts, greedy_logprobs_params) def generate_beam_search( - self, - prompts: List[str], - beam_width: int, - max_tokens: int, - ) -> List[Tuple[List[List[int]], List[str]]]: - beam_search_params = SamplingParams(n=beam_width, - use_beam_search=True, - temperature=0.0, - max_tokens=max_tokens) - outputs = self.generate(prompts, beam_search_params) - return outputs - - def generate_beam_search_new( self, prompts: Union[List[str], List[List[int]]], beam_width: int, diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index b3d3667b37d88..033778d2c35e0 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -85,73 +85,6 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Use a large block size to trigger more copy-on-writes. - "block_size": 32, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "use_v2_block_manager": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "use_v2_block_manager": True, - "preemption_mode": "swap" -}, { - "use_v2_block_manager": True, - "preemption_mode": "recompute" -}]) -@pytest.mark.parametrize("batch_size", [10]) -@pytest.mark.parametrize("seed", [1]) -def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify beam search equality with block manager v1 and v2. - - This requires copy-on-writes; if the v1 and v2 output is the same, then - we have some confidence cow is working. 
- """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - use_beam_search=True, - best_of=2, - ) - - print('Getting token ids from block manager v1') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids from block manager v2') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/core/utils.py b/tests/core/utils.py index 1e4332268c2f3..a95a573db7cd3 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -13,7 +13,6 @@ def create_dummy_prompt( prompt_length: int, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, - use_beam_search: bool = False, best_of: int = 1, prompt_tokens: Optional[List[int]] = None, min_tokens: int = 0, @@ -37,7 +36,6 @@ def create_dummy_prompt( seqs=[prompt], arrival_time=time.time(), sampling_params=SamplingParams( - use_beam_search=use_beam_search, best_of=best_of, max_tokens=max_tokens, min_tokens=min_tokens), @@ -52,7 +50,6 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_length: int, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, - use_beam_search: bool = False, best_of: int = 1, ) -> Tuple[Sequence, Sequence, SequenceGroup]: if not block_size: @@ -85,9 +82,7 @@ def create_dummy_prompt_encoder_decoder( from_decoder_prompt=False) seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], - sampling_params=SamplingParams( - use_beam_search=use_beam_search, - best_of=best_of), + sampling_params=SamplingParams(best_of=best_of), arrival_time=time.time(), lora_request=lora_request, encoder_seq=encoder_prompt) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index a9bedc2956fdd..4d1a6978d4c55 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -33,8 +33,8 @@ def test_beam_search_single_input( max_tokens) with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_beam_search_new( - example_prompts, beam_width, max_tokens) + vllm_outputs = vllm_model.generate_beam_search(example_prompts, + beam_width, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_texts = hf_outputs[i] diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 9d4932dd1f5b1..28c34064f670c 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -159,26 +159,6 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): assert first_sampler_output == second_sampler_output -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_beam(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams( - temperature=0, - 
best_of=2, - use_beam_search=True, - ) - _do_sample(batch_size, fake_logits, sampler, sampling_params, device) - # no assertion here as I am not sure how to determine whether - # the outputs are expected - in other words, this just tests - # whether there are no exceptions in the sampler - # when handling an all-beam search case. - - @pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_min_tokens_penalty(seed: int, device: str): @@ -479,7 +459,7 @@ def test_sampler_mixed(seed: int, device: str): seq_lens: List[int] = [] for i in range(batch_size): expected: Optional[List[int]] = None - sampling_type = random.randint(0, 3) + sampling_type = random.randint(0, 2) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] @@ -498,10 +478,7 @@ def test_sampler_mixed(seed: int, device: str): for idx in range(n): fake_logits[i, i + idx] = 1e2 expected = list(range(i, i + n)) - else: - sampling_params = SamplingParams(temperature=0, - use_beam_search=True, - best_of=2) + expected_tokens.append(expected) seq_group_metadata_list.append( SequenceGroupMetadata( @@ -530,9 +507,6 @@ def test_sampling(): zip(sampler_output, seq_group_metadata_list)): assert metadata.sampling_params is not None - if metadata.sampling_params.use_beam_search: - continue - if (metadata.sampling_params.seed is not None and expected_tokens[i] is None): # Record seeded random result to compare with results of diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f3a5016d0e62a..c57e6cd716405 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1202,9 +1202,9 @@ def _can_append_slots(self, seq_group: SequenceGroup, seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: + # TODO: does it work with parallel sampling? 
no_beam_search = seq_group.sampling_params is None or ( - seq_group.sampling_params.best_of == 1 - and not seq_group.sampling_params.use_beam_search) + seq_group.sampling_params.best_of == 1) return no_beam_search def schedule( diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index a0aaa9e6c372a..50269493d64e9 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -33,7 +33,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (collect_from_async_generator, deprecate_kwargs, - random_uuid, weak_bind) + get_beam_search_score, random_uuid, weak_bind) logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -1050,6 +1050,12 @@ async def beam_search( max_tokens = params.max_tokens ignore_eos = params.ignore_eos temperature = params.temperature + length_penalty = params.length_penalty + + def sort_beams_key(x: BeamSearchSequence) -> float: + return get_beam_search_score(x.tokens, x.cum_logprob, + tokenizer.eos_token_id, + length_penalty) tokenizer = await self.get_tokenizer() tokenizedPrompt = prompt if isinstance( @@ -1103,15 +1109,11 @@ async def beam_search( else: new_beams.append(new_beam) - sorted_beams = sorted(new_beams, - key=lambda x: x.cum_logprob, - reverse=True) + sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True) all_beams = sorted_beams[:beam_width] completed.extend(all_beams) - sorted_completed = sorted(completed, - key=lambda x: x.cum_logprob, - reverse=True) + sorted_completed = sorted(completed, key=sort_beams_key, reverse=True) best_beams = sorted_completed[:beam_width] for beam in best_beams: diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index e288aa0c4aafd..00d9297e41d99 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Tuple from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -6,7 +6,6 @@ SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer @@ -113,7 +112,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput, is_async: bool) -> None: sampling_params = seq_group.sampling_params - if sampling_params.best_of == 1 and not sampling_params.use_beam_search: + if sampling_params.best_of == 1: # only have one output sample sample = outputs.samples[0] # only have one sequence @@ -142,7 +141,6 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Process samples samples = outputs.samples parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - existing_finished_seqs = seq_group.get_finished_seqs() parent_child_dict: Dict[int, List[SequenceOutput]] = { parent_seq.seq_id: [] for parent_seq in parent_seqs @@ -197,106 +195,9 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, lora_req=seq_group.lora_request, ) - # Non-beam search case - if not sampling_params.use_beam_search: - # For newly created child sequences, add them to the sequence group - # and fork them in 
block manager if they are not finished. - for seq, parent in child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - for scheduler in self.scheduler: - scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - # NOTE: we need to fork the new sequences before freeing the - # old sequences. - for seq, parent in child_seqs: - if seq is parent and seq.is_finished(): - for scheduler in self.scheduler: - scheduler.free_seq(seq) - return - - # Beam search case - # Select the child sequences to keep in the sequence group. - selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] - unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] - beam_width = sampling_params.best_of - length_penalty = sampling_params.length_penalty - - # Select the newly finished sequences with the highest scores - # to replace existing finished sequences. - # Tuple of (seq, parent, is_new) - existing_finished_seqs = [(seq, None, False) - for seq in existing_finished_seqs] - new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs - if seq.is_finished()] - all_finished_seqs = existing_finished_seqs + new_finished_seqs - # Sort the finished sequences by their scores. - all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - for seq, parent, is_new in all_finished_seqs[:beam_width]: - if is_new: - # A newly generated child sequence finishes and has a high - # score, so we will add it into the sequence group. - selected_child_seqs.append((seq, parent)) - for seq, parent, is_new in all_finished_seqs[beam_width:]: - if is_new: - # A newly generated child sequence finishes but has a low - # score, so we will not add it into the sequence group. - # Additionally, if this sequence is a continuation of a - # parent sequence, we will need remove the parent sequence - # from the sequence group. - unselected_child_seqs.append((seq, parent)) - else: - # An existing finished sequence has a low score, so we will - # remove it from the sequence group. - seq_group.remove(seq.seq_id) - - # select the top beam_width sequences from the running - # sequences for the next iteration to continue the beam - # search. - running_child_seqs = [(seq, parent) for seq, parent in child_seqs - if not seq.is_finished()] - # Sort the running sequences by their scores. - running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - - # Check if we can stop the beam search. - if len(running_child_seqs) == 0: - # No running sequences, stop the beam search. - stop_beam_search = True - elif len(all_finished_seqs) < beam_width: - # Not enough finished sequences, continue the beam search. - stop_beam_search = False - else: - # Check the early stopping criteria - best_running_seq = running_child_seqs[0][0] - current_worst_seq = all_finished_seqs[beam_width - 1][0] - stop_beam_search = self._check_beam_search_early_stopping( - sampling_params.early_stopping, sampling_params, - best_running_seq, current_worst_seq) - - if stop_beam_search: - # Stop the beam search and remove all the running sequences from - # the sequence group. - unselected_child_seqs.extend(running_child_seqs) - else: - # Continue the beam search and select the top beam_width sequences - # to continue the beam search. 
- selected_child_seqs.extend(running_child_seqs[:beam_width]) - # The remaining running sequences will not be used in the next - # iteration. Again, if these sequences are continuations of - # parent sequences, we will need to remove the parent sequences - # from the sequence group. - unselected_child_seqs.extend(running_child_seqs[beam_width:]) - # For newly created child sequences, add them to the sequence group # and fork them in block manager if they are not finished. - for seq, parent in selected_child_seqs: + for seq, parent in child_seqs: if seq is not parent: seq_group.add(seq) if not seq.is_finished(): @@ -305,61 +206,10 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Free the finished and selected parent sequences' memory in block # manager. Keep them in the sequence group as candidate output. - for seq, parent in selected_child_seqs: + # NOTE: we need to fork the new sequences before freeing the + # old sequences. + for seq, parent in child_seqs: if seq is parent and seq.is_finished(): for scheduler in self.scheduler: scheduler.free_seq(seq) - - # Remove the unselected parent sequences from the sequence group and - # free their memory in block manager. - for seq, parent in unselected_child_seqs: - if seq is parent: - # Remove the parent sequence if it is not selected for next - # iteration - seq_group.remove(seq.seq_id) - for scheduler in self.scheduler: - scheduler.free_seq(seq) - - def _check_beam_search_early_stopping( - self, - early_stopping: Union[bool, str], - sampling_params: SamplingParams, - best_running_seq: Sequence, - current_worst_seq: Sequence, - ) -> bool: - assert sampling_params.use_beam_search - length_penalty = sampling_params.length_penalty - if early_stopping is True: - return True - - current_worst_score = current_worst_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=current_worst_seq.eos_token_id) - if early_stopping is False: - highest_attainable_score = best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id) - else: - assert early_stopping == "never" - if length_penalty > 0.0: - # If length_penalty > 0.0, beam search will prefer longer - # sequences. The highest attainable score calculation is - # based on the longest possible sequence length in this case. - max_possible_length = max( - best_running_seq.get_prompt_len() + - sampling_params.max_tokens, - self.scheduler_config.max_model_len) - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id, - seq_len=max_possible_length)) - else: - # Otherwise, beam search will prefer shorter sequences. The - # highest attainable score calculation is based on the current - # sequence length. 
- highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id)) - return current_worst_score >= highest_attainable_score + return diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1cb35ee92348d..439f3769f9fbd 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -28,7 +28,8 @@ get_cached_tokenizer) from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, deprecate_kwargs, is_list_of +from vllm.utils import (Counter, deprecate_kwargs, get_beam_search_score, + is_list_of) logger = init_logger(__name__) @@ -404,6 +405,12 @@ def beam_search( max_tokens = params.max_tokens temperature = params.temperature ignore_eos = params.ignore_eos + length_penalty = params.length_penalty + + def sort_beams_key(x: BeamSearchSequence) -> float: + return get_beam_search_score(x.tokens, x.cum_logprob, + tokenizer.eos_token_id, + length_penalty) tokenizer = self.get_tokenizer() # generate 2 * beam_width candidates at each step @@ -466,7 +473,7 @@ def beam_search( else: instance_new_beams.append(new_beam) sorted_beams = sorted(instance_new_beams, - key=lambda x: x.cum_logprob, + key=sort_beams_key, reverse=True) instance.beams = sorted_beams[:beam_width] @@ -474,7 +481,7 @@ def beam_search( for instance in instances: instance.completed.extend(instance.beams) sorted_completed = sorted(instance.completed, - key=lambda x: x.cum_logprob, + key=sort_beams_key, reverse=True) best_beams = sorted_completed[:beam_width] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f0aaf3733869d..6f1135f8093ba 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -184,7 +184,6 @@ class ChatCompletionRequest(OpenAIBaseModel): min_p: float = 0.0 repetition_penalty: float = 1.0 length_penalty: float = 1.0 - early_stopping: bool = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False @@ -302,6 +301,7 @@ def to_beam_search_params(self, max_tokens=max_tokens, ignore_eos=self.ignore_eos, temperature=temperature, + length_penalty=self.length_penalty, ) def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: @@ -345,12 +345,9 @@ def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: ignore_eos=self.ignore_eos, max_tokens=max_tokens, min_tokens=self.min_tokens, - use_beam_search=self.use_beam_search, - early_stopping=self.early_stopping, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - length_penalty=self.length_penalty, truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, @@ -518,7 +515,6 @@ class CompletionRequest(OpenAIBaseModel): min_p: float = 0.0 repetition_penalty: float = 1.0 length_penalty: float = 1.0 - early_stopping: bool = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False @@ -597,6 +593,7 @@ def to_beam_search_params(self, max_tokens=max_tokens, ignore_eos=self.ignore_eos, temperature=temperature, + length_penalty=self.length_penalty, ) def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: @@ -641,13 +638,10 @@ 
def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: ignore_eos=self.ignore_eos, max_tokens=max_tokens if not echo_without_generation else 1, min_tokens=self.min_tokens, - use_beam_search=self.use_beam_search, - early_stopping=self.early_stopping, prompt_logprobs=prompt_logprobs, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - length_penalty=self.length_penalty, truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, diff --git a/vllm/envs.py b/vllm/envs.py index 0f46ac4f61fdf..d15cded416385 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -63,7 +63,6 @@ VLLM_TORCH_PROFILER_DIR: Optional[str] = None VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False - VLLM_ALLOW_DEPRECATED_BEAM_SEARCH: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -198,10 +197,6 @@ def get_default_config_root(): lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), - # If set, allowing the use of deprecated beam search implementation - "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH": - lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BEAM_SEARCH", "0") == "1", - # Internal flag to enable Dynamo graph capture "VLLM_TEST_DYNAMO_GRAPH_CAPTURE": lambda: int(os.environ.get("VLLM_TEST_DYNAMO_GRAPH_CAPTURE", "0")), diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cfa857b8f9606..0b959da79c3be 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -947,8 +947,6 @@ def get_logprobs( # largest num logprobs in this API. If every logprobs is None, it will be # set to -1. largest_num_logprobs = -1 - # If beam search is enabled. - use_beam_search = False # Select indices to compute logprob from, ranks of token ids, and the top # k token ids from logprobs. @@ -981,8 +979,6 @@ def get_logprobs( largest_num_logprobs = max(largest_num_logprobs, sampling_params.logprobs) - use_beam_search = use_beam_search or sampling_params.use_beam_search - assert len(next_token_ids) == len(query_indices) if len(query_indices) == 0: @@ -995,7 +991,7 @@ def get_logprobs( # If largest_num_logprobs == -1, i.e. no logprobs are requested, we can # skip the whole logprob calculation. - if largest_num_logprobs >= 0 or use_beam_search: + if largest_num_logprobs >= 0: query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) @@ -1121,13 +1117,12 @@ def _get_sampled_logprob_if_needed( """Compute the sample logprob if needed.""" seq_ids = seq_group.seq_ids num_logprobs = seq_group.sampling_params.logprobs - use_beam_search = seq_group.sampling_params.use_beam_search sampled_logprobs: SampleLogprobs = [] next_token_ids, parent_seq_ids = sample_result if seq_group.do_sample: assert len(next_token_ids) > 0 - if num_logprobs is None and not use_beam_search: + if num_logprobs is None: for next_token_id in next_token_ids: # Use a dummy logprob sampled_logprobs.append({next_token_id: Logprob(inf)}) diff --git a/vllm/outputs.py b/vllm/outputs.py index 44cde6b561d85..4f29226aa5128 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -142,11 +142,7 @@ def from_seq_group(cls, seq_group: SequenceGroup, else: # Get the top-n sequences. 
n = sampling_params.n - if sampling_params.use_beam_search: - sorting_key = lambda seq: seq.get_beam_search_score( - sampling_params.length_penalty) - else: - sorting_key = lambda seq: seq.get_cumulative_logprob() + sorting_key = lambda seq: seq.get_cumulative_logprob() sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) top_n_seqs = sorted_seqs[:n] diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index adf0d2dd6ca2f..e074312280584 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -10,7 +10,6 @@ from pydantic import BaseModel from typing_extensions import Annotated -import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -23,7 +22,6 @@ class SamplingType(IntEnum): GREEDY = 0 RANDOM = 1 RANDOM_SEED = 2 - BEAM = 3 LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], @@ -134,16 +132,6 @@ class SamplingParams( considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this. seed: Random seed to use for the generation. - use_beam_search: Whether to use beam search instead of sampling. - length_penalty: Float that penalizes sequences based on their length. - Used in beam search. - early_stopping: Controls the stopping condition for beam search. It - accepts the following values: `True`, where the generation stops as - soon as there are `best_of` complete candidates; `False`, where an - heuristic is applied and the generation stops when is it very - unlikely to find better candidates; `"never"`, where the beam search - procedure only stops when there cannot be better candidates - (canonical beam search algorithm). stop: List of strings that stop the generation when they are generated. The returned output will not contain the stop strings. stop_token_ids: List of tokens that stop the generation when they are @@ -193,9 +181,6 @@ class SamplingParams( top_k: int = -1 min_p: float = 0.0 seed: Optional[int] = None - use_beam_search: bool = False - length_penalty: float = 1.0 - early_stopping: Union[bool, str] = False stop: Optional[Union[str, List[str]]] = None stop_token_ids: Optional[List[int]] = None ignore_eos: bool = False @@ -238,9 +223,6 @@ def from_optional( top_k: int = -1, min_p: float = 0.0, seed: Optional[int] = None, - use_beam_search: bool = False, - length_penalty: float = 1.0, - early_stopping: Union[bool, str] = False, stop: Optional[Union[str, List[str]]] = None, stop_token_ids: Optional[List[int]] = None, include_stop_str_in_output: bool = False, @@ -280,9 +262,6 @@ def from_optional( top_k=top_k, min_p=min_p, seed=seed, - use_beam_search=use_beam_search, - length_penalty=length_penalty, - early_stopping=early_stopping, stop=stop, stop_token_ids=stop_token_ids, include_stop_str_in_output=include_stop_str_in_output, @@ -334,20 +313,13 @@ def __post_init__(self) -> None: self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 self._verify_args() - if self.use_beam_search: - if not envs.VLLM_ALLOW_DEPRECATED_BEAM_SEARCH: - raise ValueError( - "Using beam search as a sampling parameter is deprecated, and will be removed in the future release. Please use the `vllm.LLM.use_beam_search` method for dedicated beam search instead, or set the environment variable `VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1` to suppress this error. For more details, see https://github.com/vllm-project/vllm/issues/8306 ." 
# noqa - ) - self._verify_beam_search() - else: - self._verify_non_beam_search() - if self.temperature < _SAMPLING_EPS: - # Zero temperature means greedy sampling. - self.top_p = 1.0 - self.top_k = -1 - self.min_p = 0.0 - self._verify_greedy_sampling() + + if self.temperature < _SAMPLING_EPS: + # Zero temperature means greedy sampling. + self.top_p = 1.0 + self.top_k = -1 + self.min_p = 0.0 + self._verify_greedy_sampling() # eos_token_id is added to this by the engine self._all_stop_token_ids = set(self.stop_token_ids) @@ -417,31 +389,6 @@ def _verify_args(self) -> None: RequestOutputKind.DELTA): raise ValueError("best_of must equal n to use output_kind=DELTA") - def _verify_beam_search(self) -> None: - if self.best_of == 1: - raise ValueError("best_of must be greater than 1 when using beam " - f"search. Got {self.best_of}.") - if self.temperature > _SAMPLING_EPS: - raise ValueError("temperature must be 0 when using beam search.") - if self.top_p < 1.0 - _SAMPLING_EPS: - raise ValueError("top_p must be 1 when using beam search.") - if self.top_k != -1: - raise ValueError("top_k must be -1 when using beam search.") - if self.early_stopping not in [True, False, "never"]: - raise ValueError( - f"early_stopping must be True, False, or 'never', " - f"got {self.early_stopping}.") - - def _verify_non_beam_search(self) -> None: - if self.early_stopping is not False: - raise ValueError("early_stopping is not effective and must be " - "False when not using beam search.") - if (self.length_penalty < 1.0 - _SAMPLING_EPS - or self.length_penalty > 1.0 + _SAMPLING_EPS): - raise ValueError( - "length_penalty is not effective and must be the " - "default value of 1.0 when not using beam search.") - def _verify_greedy_sampling(self) -> None: assert isinstance(self.best_of, int) if self.best_of > 1: @@ -476,8 +423,6 @@ def update_from_generation_config( @cached_property def sampling_type(self) -> SamplingType: - if self.use_beam_search: - return SamplingType.BEAM if self.temperature < _SAMPLING_EPS: return SamplingType.GREEDY if self.seed is not None: @@ -514,9 +459,6 @@ def __repr__(self) -> str: f"top_k={self.top_k}, " f"min_p={self.min_p}, " f"seed={self.seed}, " - f"use_beam_search={self.use_beam_search}, " - f"length_penalty={self.length_penalty}, " - f"early_stopping={self.early_stopping}, " f"stop={self.stop}, " f"stop_token_ids={self.stop_token_ids}, " f"include_stop_str_in_output={self.include_stop_str_in_output}, " @@ -542,3 +484,4 @@ class BeamSearchParams( max_tokens: int ignore_eos: bool = False temperature: float = 0.0 + length_penalty: float = 1.0 diff --git a/vllm/sequence.py b/vllm/sequence.py index 781bcedde2b52..9116408a001ff 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -577,25 +577,6 @@ def get_output_token_ids(self) -> Tuple[int, ...]: def get_cumulative_logprob(self) -> float: return self.data.cumulative_logprob - def get_beam_search_score(self, - length_penalty: float = 1.0, - seq_len: Optional[int] = None, - eos_token_id: Optional[int] = None) -> float: - """Calculate the beam search score with length penalty. - - Adapted from - - https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 - """ - if seq_len is None: - seq_len = self.get_len() - # NOTE: HF implementation does not count the EOS token - # towards the length, we align with that here for testing. 
- if (eos_token_id is not None - and self.get_last_token_id() == eos_token_id): - seq_len -= 1 - return self.get_cumulative_logprob() / (seq_len**length_penalty) - def is_finished(self) -> bool: return SequenceStatus.is_finished(self.status) @@ -809,25 +790,18 @@ def set_finished_time(self, time: Optional[float]) -> None: def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" - if self.sampling_params and self.sampling_params.use_beam_search: - # For beam search, maximally there will always be `best_of` beam - # candidates running in the future. + if self.sampling_params: best_of = self.sampling_params.best_of assert isinstance(best_of, int) - return best_of - else: - if self.sampling_params: - best_of = self.sampling_params.best_of - assert isinstance(best_of, int) - if best_of > self.num_seqs(): - # At prompt stage, the sequence group is not yet filled up - # and only have one sequence running. However, in the - # generation stage, we will have `best_of` sequences - # running. - return best_of - # At sampling stages, return the number of actual sequences - # that are not finished yet. - return self.num_unfinished_seqs() + if best_of > self.num_seqs(): + # At prompt stage, the sequence group is not yet filled up + # and only have one sequence running. However, in the + # generation stage, we will have `best_of` sequences + # running. + return best_of + # At sampling stages, return the number of actual sequences + # that are not finished yet. + return self.num_unfinished_seqs() def get_seqs( self, diff --git a/vllm/utils.py b/vllm/utils.py index e44365fa24990..1b7638c4a12ac 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1361,3 +1361,22 @@ def dec(self, num=1): @property def value(self): return self._value + + +def get_beam_search_score( + tokens: List[int], + cumulative_logprob: float, + eos_token_id: int, + length_penalty: float = 1.0, +) -> float: + """Calculate the beam search score with length penalty. 
+ + Adapted from + + https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 + """ + seq_len = len(tokens) + if tokens[-1] == eos_token_id: + seq_len -= 1 + + return cumulative_logprob / (seq_len**length_penalty) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 2472ac25aee44..12e4215038d74 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -453,9 +453,6 @@ def _prepare_sample( f"Best of > {_MAX_NUM_SAMPLES} is not supported by the TPU " "backend.") best_of.append(sampling_params.best_of) - if sampling_params.use_beam_search: - raise NotImplementedError( - "Beam search is not supported by the TPU backend.") if sampling_params.logprobs is not None: raise NotImplementedError( "logprobs is not currently supported by the TPU backend.") From 8c6de96ea1e6e51e49a170c28ad3efc16db9413e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 7 Oct 2024 14:10:35 +0800 Subject: [PATCH 04/12] [Model] Explicit interface for vLLM models and support OOT embedding models (#9108) --- tests/conftest.py | 20 ++ tests/models/test_oot_registration.py | 18 +- tests/models/test_registry.py | 24 ++- .../vllm_add_dummy_model/__init__.py | 6 + .../my_gemma_embedding.py | 34 ++++ vllm/model_executor/models/__init__.py | 7 + vllm/model_executor/models/interfaces.py | 28 +-- vllm/model_executor/models/interfaces_base.py | 191 ++++++++++++++++++ vllm/model_executor/models/registry.py | 42 +++- vllm/utils.py | 9 + 10 files changed, 342 insertions(+), 37 deletions(-) create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py create mode 100644 vllm/model_executor/models/interfaces_base.py diff --git a/tests/conftest.py b/tests/conftest.py index 713be09ca96ea..baa6bae03a451 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -871,6 +871,7 @@ def num_gpus_available(): temp_dir = tempfile.gettempdir() _dummy_opt_path = os.path.join(temp_dir, "dummy_opt") _dummy_llava_path = os.path.join(temp_dir, "dummy_llava") +_dummy_gemma2_embedding_path = os.path.join(temp_dir, "dummy_gemma2_embedding") @pytest.fixture @@ -909,3 +910,22 @@ def dummy_llava_path(): with open(json_path, "w") as f: json.dump(config, f) return _dummy_llava_path + + +@pytest.fixture +def dummy_gemma2_embedding_path(): + json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json") + if not os.path.exists(_dummy_gemma2_embedding_path): + snapshot_download(repo_id="BAAI/bge-multilingual-gemma2", + local_dir=_dummy_gemma2_embedding_path, + ignore_patterns=[ + "*.bin", "*.bin.index.json", "*.pt", "*.h5", + "*.msgpack" + ]) + assert os.path.exists(json_path) + with open(json_path, "r") as f: + config = json.load(f) + config["architectures"] = ["MyGemma2Embedding"] + with open(json_path, "w") as f: + json.dump(config, f) + return _dummy_gemma2_embedding_path diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index ee3f8911f318c..94be215258f89 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -2,7 +2,7 @@ import pytest -from vllm import LLM, SamplingParams +from vllm import LLM, PoolingParams, SamplingParams from vllm.assets.image import ImageAsset from ..utils import fork_new_process_for_each_test @@ -17,7 +17,7 @@ def test_plugin(dummy_opt_path): @fork_new_process_for_each_test -def test_oot_registration(dummy_opt_path): +def 
test_oot_registration_text_generation(dummy_opt_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = ["Hello, my name is", "The text does not matter"] sampling_params = SamplingParams(temperature=0) @@ -32,11 +32,23 @@ def test_oot_registration(dummy_opt_path): assert rest == "" +@fork_new_process_for_each_test +def test_oot_registration_embedding(dummy_gemma2_embedding_path): + os.environ["VLLM_PLUGINS"] = "register_dummy_model" + prompts = ["Hello, my name is", "The text does not matter"] + sampling_params = PoolingParams() + llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") + outputs = llm.encode(prompts, sampling_params) + + for output in outputs: + assert all(v == 0 for v in output.outputs.embedding) + + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") @fork_new_process_for_each_test -def test_oot_multimodal_registration(dummy_llava_path): +def test_oot_registration_multimodal(dummy_llava_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = [{ "prompt": "What's in the image?", diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 299aeacb9f337..a2194fa15f90e 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -3,7 +3,14 @@ import pytest import torch.cuda -from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models import (is_embedding_model, + is_text_generation_model, + supports_multimodal) +from vllm.model_executor.models.registry import (_EMBEDDING_MODELS, + _MULTIMODAL_MODELS, + _SPECULATIVE_DECODING_MODELS, + _TEXT_GENERATION_MODELS, + ModelRegistry) from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test @@ -12,7 +19,20 @@ @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): # Ensure all model classes can be imported successfully - ModelRegistry.resolve_model_cls(model_arch) + model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) + + if model_arch in _SPECULATIVE_DECODING_MODELS: + pass # Ignore these models which do not have a unified format + else: + assert is_text_generation_model(model_cls) is ( + model_arch in _TEXT_GENERATION_MODELS + or model_arch in _MULTIMODAL_MODELS) + + assert is_embedding_model(model_cls) is (model_arch + in _EMBEDDING_MODELS) + + assert supports_multimodal(model_cls) is (model_arch + in _MULTIMODAL_MODELS) @fork_new_process_for_each_test diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 022ba66e38cc3..62a8f871fa51b 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -9,6 +9,12 @@ def register(): ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) # Test passing lazy model + if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model( + "MyGemma2Embedding", + "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding", + ) + if "MyLlava" not in ModelRegistry.get_supported_archs(): ModelRegistry.register_model("MyLlava", "vllm_add_dummy_model.my_llava:MyLlava") diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py new file mode 100644 index 0000000000000..1d61f6b74f520 --- /dev/null +++ 
b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -0,0 +1,34 @@ +from typing import List, Optional, Union + +import torch + +from vllm.attention import AttentionMetadata +from vllm.model_executor.models.gemma2_embedding import Gemma2EmbeddingModel +from vllm.sequence import IntermediateTensors + + +class MyGemma2Embedding(Gemma2EmbeddingModel): + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = super().forward( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + if isinstance(hidden_states, IntermediateTensors): + return hidden_states + + # Return all-zero embeddings + return torch.zeros_like(hidden_states) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 51054a147a06f..eaa2b93eb3331 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,10 +1,17 @@ from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, has_inner_state, supports_lora, supports_multimodal, supports_pp) +from .interfaces_base import (VllmModelForEmbedding, + VllmModelForTextGeneration, is_embedding_model, + is_text_generation_model) from .registry import ModelRegistry __all__ = [ "ModelRegistry", + "VllmModelForEmbedding", + "is_embedding_model", + "VllmModelForTextGeneration", + "is_text_generation_model", "HasInnerState", "has_inner_state", "SupportsLoRA", diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 298174fa05965..278dfc52078ef 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,4 +1,3 @@ -import inspect from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, Protocol, Type, Union, overload, runtime_checkable) @@ -6,9 +5,9 @@ from typing_extensions import TypeIs from vllm.logger import init_logger +from vllm.utils import supports_kw if TYPE_CHECKING: - from vllm.attention import AttentionMetadata from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.sequence import IntermediateTensors @@ -142,9 +141,7 @@ def supports_lora( return result -def _supports_lora( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: +def _supports_lora(model: Union[Type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -175,10 +172,7 @@ def make_empty_intermediate_tensors( def forward( self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: "AttentionMetadata", + *, intermediate_tensors: Optional["IntermediateTensors"], ) -> Union[torch.Tensor, "IntermediateTensors"]: """ @@ -205,10 +199,7 @@ def make_empty_intermediate_tensors( def forward( self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: "AttentionMetadata", + *, intermediate_tensors: Optional["IntermediateTensors"], ) -> Union[torch.Tensor, "IntermediateTensors"]: ... 
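Stepping back from the interface changes for a moment: the out-of-tree (OOT) plugin pattern exercised by `MyGemma2Embedding` above only needs to register its architecture name with the model registry, optionally as a lazy "module:Class" string so the class is not imported until it is actually resolved. The sketch below mirrors the dummy Gemma2 embedding plugin; the names `my_plugin`, `my_embedding`, and `MyEmbeddingModel` are illustrative placeholders, not part of vLLM.

from vllm import ModelRegistry


def register():
    # Lazy registration: the "module:Class" string defers importing the
    # class until ModelRegistry.resolve_model_cls() is called for it.
    if "MyEmbeddingModel" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model(
            "MyEmbeddingModel",
            "my_plugin.my_embedding:MyEmbeddingModel",
        )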
@@ -257,24 +248,19 @@ def supports_pp( return supports_attributes and supports_inspect -def _supports_pp_attributes( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: +def _supports_pp_attributes(model: Union[Type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsPPType) return isinstance(model, SupportsPP) -def _supports_pp_inspect( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: +def _supports_pp_inspect(model: Union[Type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False - forward_params = inspect.signature(model_forward).parameters - return "intermediate_tensors" in forward_params + return supports_kw(model_forward, "intermediate_tensors") @runtime_checkable diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py new file mode 100644 index 0000000000000..8d2d422f9891c --- /dev/null +++ b/vllm/model_executor/models/interfaces_base.py @@ -0,0 +1,191 @@ +from typing import (TYPE_CHECKING, List, Optional, Protocol, Type, Union, + overload, runtime_checkable) + +import torch +import torch.nn as nn +from transformers import PretrainedConfig +from typing_extensions import TypeIs, TypeVar + +from vllm.logger import init_logger +from vllm.utils import supports_kw + +if TYPE_CHECKING: + from vllm.attention import AttentionMetadata + from vllm.config import CacheConfig + from vllm.model_executor.layers.pooler import PoolerOutput + from vllm.model_executor.layers.quantization import QuantizationConfig + from vllm.model_executor.layers.sampler import SamplerOutput + from vllm.model_executor.pooling_metadata import PoolingMetadata + from vllm.model_executor.sampling_metadata import SamplingMetadata + +logger = init_logger(__name__) + +# The type of HF config +C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True) + +# The type of hidden states +# Currently, T = torch.Tensor for all models except for Medusa +# which has T = List[torch.Tensor] +T = TypeVar("T", default=torch.Tensor) +T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) + +# NOTE: Unlike those in `interfaces.py`, we don't define `ClassVar` tags +# for the base interfaces to avoid breaking OOT registration for existing models +# that don't inherit from the base interface classes + + +@runtime_checkable +class VllmModel(Protocol[C_co, T_co]): + + def __init__( + self, + config: C_co, + *, + cache_config: Optional["CacheConfig"], + quant_config: Optional["QuantizationConfig"], + ) -> None: + ... + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: "AttentionMetadata", + ) -> T_co: + ... 
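The pipeline-parallel check above (and the base-interface checks that follow) reduce to signature introspection through the new `supports_kw` helper. A self-contained sketch of that behavior; the helper is re-declared locally so the snippet runs without vLLM installed, and the toy model classes are illustrative only.

import inspect
from typing import Callable


def supports_kw(callable: Callable[..., object], kw_name: str) -> bool:
    # Mirrors the helper added to vllm/utils.py: a keyword is supported if it
    # is an explicit parameter or the callable accepts **kwargs.
    params = inspect.signature(callable).parameters
    if kw_name in params:
        return True
    return any(param.kind == inspect.Parameter.VAR_KEYWORD
               for param in params.values())


class ToyModel:
    def forward(self, input_ids, positions, kv_caches, attn_metadata):
        return input_ids


class ToyPPModel:
    def forward(self, *, intermediate_tensors=None, **kwargs):
        return intermediate_tensors


assert not supports_kw(ToyModel.forward, "intermediate_tensors")
assert supports_kw(ToyPPModel.forward, "intermediate_tensors")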
+ + +def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: + model_init = model.__init__ + vllm_kws = ("cache_config", "quant_config") + missing_kws = tuple(kw for kw in vllm_kws + if not supports_kw(model_init, kw)) + + if missing_kws and (isinstance(model, type) + and issubclass(model, nn.Module)): + logger.warning( + "The model (%s) is missing " + "vLLM-specific keywords from its initializer: %s", + model, + missing_kws, + ) + + return len(missing_kws) == 0 + + +def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: + model_forward = getattr(model, "forward", None) + if not callable(model_forward): + return False + + vllm_kws = ("input_ids", "positions", "kv_caches", "attn_metadata") + missing_kws = tuple(kw for kw in vllm_kws + if not supports_kw(model_forward, kw)) + + if missing_kws and (isinstance(model, type) + and issubclass(model, nn.Module)): + logger.warning( + "The model (%s) is missing " + "vLLM-specific keywords from its initializer: %s", + model, + missing_kws, + ) + + return len(missing_kws) == 0 + + +@overload +def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]: + ... + + +@overload +def is_vllm_model(model: object) -> TypeIs[VllmModel]: + ... + + +def is_vllm_model( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]: + return _check_vllm_model_init(model) and _check_vllm_model_forward(model) + + +@runtime_checkable +class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): + + def compute_logits( + self, + hidden_states: T, + sampling_metadata: "SamplingMetadata", + ) -> Optional[T]: + """Return `None` if TP rank > 0.""" + ... + + def sample( + self, + logits: T, + sampling_metadata: "SamplingMetadata", + ) -> "SamplerOutput": + """Only called on TP rank 0.""" + ... + + +@overload +def is_text_generation_model( + model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]: + ... + + +@overload +def is_text_generation_model( + model: object) -> TypeIs[VllmModelForTextGeneration]: + ... + + +def is_text_generation_model( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModelForTextGeneration]], + TypeIs[VllmModelForTextGeneration]]: + if not is_vllm_model(model): + return False + + if isinstance(model, type): + return isinstance(model, VllmModelForTextGeneration) + + return isinstance(model, VllmModelForTextGeneration) + + +@runtime_checkable +class VllmModelForEmbedding(VllmModel[C_co, T], Protocol[C_co, T]): + + def pooler( + self, + hidden_states: T, + pooling_metadata: "PoolingMetadata", + ) -> "PoolerOutput": + """Only called on TP rank 0.""" + ... + + +@overload +def is_embedding_model( + model: Type[object]) -> TypeIs[Type[VllmModelForEmbedding]]: + ... + + +@overload +def is_embedding_model(model: object) -> TypeIs[VllmModelForEmbedding]: + ... 
+ + +def is_embedding_model( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[VllmModelForEmbedding]], TypeIs[VllmModelForEmbedding]]: + if not is_vllm_model(model): + return False + + if isinstance(model, type): + return isinstance(model, VllmModelForEmbedding) + + return isinstance(model, VllmModelForEmbedding) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ccb0e155ff4aa..46c69f17f4471 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -12,10 +12,12 @@ from vllm.utils import is_hip from .interfaces import supports_multimodal, supports_pp +from .interfaces_base import is_embedding_model, is_text_generation_model logger = init_logger(__name__) -_GENERATION_MODELS = { +_TEXT_GENERATION_MODELS = { + # [Decoder-only] "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), @@ -74,10 +76,9 @@ "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), - # NOTE: The below models are for speculative decoding only - "MedusaModel": ("medusa", "Medusa"), - "EAGLEModel": ("eagle", "EAGLE"), - "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), + # [Encoder-decoder] + "BartModel": ("bart", "BartForConditionalGeneration"), + "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), } _EMBEDDING_MODELS = { @@ -114,16 +115,18 @@ "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), } -_CONDITIONAL_GENERATION_MODELS = { - "BartModel": ("bart", "BartForConditionalGeneration"), - "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + +_SPECULATIVE_DECODING_MODELS = { + "EAGLEModel": ("eagle", "EAGLE"), + "MedusaModel": ("medusa", "Medusa"), + "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } _MODELS = { - **_GENERATION_MODELS, + **_TEXT_GENERATION_MODELS, **_EMBEDDING_MODELS, **_MULTIMODAL_MODELS, - **_CONDITIONAL_GENERATION_MODELS, + **_SPECULATIVE_DECODING_MODELS, } # Architecture -> type or (module, class). 
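With the registry regrouped by task, per-architecture classification can be checked directly against the new base interfaces, in the same spirit as the updated `test_registry_imports` above. A small example, assuming a checkout with this patch applied; the imports follow the diffs above.

from vllm.model_executor.models import (is_embedding_model,
                                        is_text_generation_model)
from vllm.model_executor.models.registry import ModelRegistry

# BART now lives under the [Encoder-decoder] block of _TEXT_GENERATION_MODELS.
model_cls, _ = ModelRegistry.resolve_model_cls("BartForConditionalGeneration")
assert is_text_generation_model(model_cls)
assert not is_embedding_model(model_cls)

# The string-based registry helpers mirror the class-level checks.
assert ModelRegistry.is_text_generation_model("BartForConditionalGeneration")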
@@ -317,6 +320,19 @@ def _check_stateless( return result.returncode == 0 + @staticmethod + def is_text_generation_model(architectures: Union[str, List[str]]) -> bool: + if isinstance(architectures, str): + architectures = [architectures] + if not architectures: + logger.warning("No model architectures are specified") + + is_txt_gen = partial(ModelRegistry._check_stateless, + is_text_generation_model, + default=False) + + return any(is_txt_gen(arch) for arch in architectures) + @staticmethod def is_embedding_model(architectures: Union[str, List[str]]) -> bool: if isinstance(architectures, str): @@ -324,7 +340,11 @@ def is_embedding_model(architectures: Union[str, List[str]]) -> bool: if not architectures: logger.warning("No model architectures are specified") - return any(arch in _EMBEDDING_MODELS for arch in architectures) + is_emb = partial(ModelRegistry._check_stateless, + is_embedding_model, + default=False) + + return any(is_emb(arch) for arch in architectures) @staticmethod def is_multimodal_model(architectures: Union[str, List[str]]) -> bool: diff --git a/vllm/utils.py b/vllm/utils.py index 1b7638c4a12ac..9c6f1a347fb83 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1277,6 +1277,15 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, return await task(*args, **kwargs) +def supports_kw(callable: Callable[..., object], kw_name: str) -> bool: + params = inspect.signature(callable).parameters + if kw_name in params: + return True + + return any(param.kind == inspect.Parameter.VAR_KEYWORD + for param in params.values()) + + def get_allowed_kwarg_only_overrides( callable: Callable[..., object], overrides: Optional[Dict[str, Any]], From 4f95ffee6f40198911ee824ed06d645fe9678511 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Oct 2024 14:50:35 +0800 Subject: [PATCH 05/12] [Hardware][CPU] Cross-attention and Encoder-Decoder models support on CPU backend (#9089) --- .buildkite/run-cpu-test.sh | 1 + .../encoder_decoder/language/test_bart.py | 428 +++++++++--------- vllm/attention/backends/torch_sdpa.py | 360 ++++++++++++--- vllm/worker/cpu_enc_dec_model_runner.py | 311 +++++++++++++ vllm/worker/cpu_model_runner.py | 10 +- vllm/worker/cpu_worker.py | 11 +- 6 files changed, 834 insertions(+), 287 deletions(-) create mode 100644 vllm/worker/cpu_enc_dec_model_runner.py diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 73ce82c5857ab..c1c471ec974f8 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,6 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator + pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language \ --ignore=tests/models/test_fp8.py \ --ignore=tests/models/decoder_only/language/test_jamba.py \ diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 758a9b743b397..8e8862fadbf04 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -4,220 +4,214 @@ """ from typing import List, Optional, Tuple, Type -from vllm.utils import is_cpu - -if not is_cpu(): - # CPU backend is not currently supported with encoder/decoder models - # skip test definitions entirely to avoid importing GPU kernel libs - # (xFormers, etc.) 
- - import pytest - from transformers import AutoModelForSeq2SeqLM - - from vllm.sequence import SampleLogprobs - - from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, - HfRunner, VllmRunner) - from ....utils import multi_gpu_test - from ...utils import check_logprobs_close - - MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] - - def vllm_to_hf_output( - vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, - ): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "" - if decoder_prompt_type == DecoderPromptType.NONE: - hf_output_str = "" + hf_output_str - - return output_ids, hf_output_str, out_logprobs - - def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - prompts: List[ExplicitEncoderDecoderPrompt[str, str]], - decoder_prompt_type: DecoderPromptType, - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, - ) -> None: - ''' - Test the vLLM BART model for a variety of encoder/decoder input prompts, - by validating it against HuggingFace (HF) BART. - - Arguments: - - * hf_runner: HuggingFace (HF) test model runner - * vllm_runner: vLLM test model runner - * example_encoder_decoder_prompts: test fixture which provides a - dictionary of dummy prompts - * model: the HF ID of the specific BART variant under test - * dtype: the tensor datatype to employ - * max_tokens - * num_logprobs - * decoder_prompt_type: key into the example_encoder_decoder_prompts - dictionary; selects specific encoder/decoder - prompt scenarios to test - - A note on using HF BART as a baseline for validating vLLM BART, - specifically when the decoder prompt is None. - - The HF GenerationMixin's default behavior is to force the first - decoded token to be if the prompt does not already contain - (this is accomplished using a logit - processor setting.) - - So when we use HF BART as our baseline for comparison, note that - when the user provides a request with a None decoder prompt - (i.e. a singleton encoder prompt, or else an explicit encoder/ - decoder prompt with the decoder sub-prompt set to None), HF and - vLLM handle this in different ways: - - * HF will (1) tokenize the None prompt as an empty token-list, - (2) append to the beginning, yielding - [], (3) pass this token list to the model, and - then (4) after computing logits during prefill, override the model - logits & force to be the first generated token. - - * vLLM will (1) tokenize the None prompt as [], (2) append decoder- - start-token to the beginning, yielding [], - (3) pass these tokens to the model & proceed with generation. - - The net effect is that compared to vLLM, the list of HF *decoded* tokens - will contain one more initial than the vLLM generated tokens, - because vLLM's token is injected into the prompt rather than into - the generated output. This is in spite of the fact that overall, the - complete sequences (prompt + decoded tokens) produced by vLLM will match - HF. - - So when we use HF decoded token output to validate vLLM's decoded token - output, the testing process must account for the difference in decoded - token sequences between vLLM and HF specifically in the - decoder-prompt-is-None case. 
- - One option is to disable the logit processor feature that forces the - token to be decoded (forced_bos_token_id = None), eliminating - the problem entirely. However this is not "normal" BART usage. - - The other option is - only in the decoder-prompt-is-None case - to - discard the first decoded token from the HF output before comparing it - to vLLM. - - To that end, when testing the scenario where the decoder prompt is None - (and only in that one scenario), this test skips the first HF decoded - token during the process of validating the vLLM decoded output. - ''' - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default). - - # Note: currently encoder/decoder models are only compatible with - # enforce_eager=True. Normally this is not a problem because - # for encoder/decoder models vLLM will - # default to enforce_eager=True if enforce_eager - # is left unspecified. However, the - # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-exisitng - # decoder-only unit tests expect), so when testing an encoder/decoder - # model we must explicitly specify enforce_eager=True in the VllmRunner - # constructor. - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, max_tokens, num_logprobs) - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE - else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) - - @pytest.mark.parametrize("model", MODELS) - @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) - @pytest.mark.parametrize("max_tokens", [64]) - @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) - def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, - model, dtype, max_tokens, num_logprobs, - decoder_prompt_type) -> None: - - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - @multi_gpu_test(num_gpus=2) - @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) - @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) - @pytest.mark.parametrize("dtype", ["float"]) - @pytest.mark.parametrize("max_tokens", [64]) - @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", 
[DecoderPromptType.CUSTOM]) - def test_models_distributed(hf_runner, vllm_runner, - example_encoder_decoder_prompts, - distributed_executor_backend, model, dtype, - max_tokens, num_logprobs, - decoder_prompt_type) -> None: - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - ) +import pytest +from transformers import AutoModelForSeq2SeqLM + +from vllm.sequence import SampleLogprobs + +from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, + HfRunner, VllmRunner) +from ....utils import multi_gpu_test +from ...utils import check_logprobs_close + +MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] + + +def vllm_to_hf_output( + vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, +): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "" + if decoder_prompt_type == DecoderPromptType.NONE: + hf_output_str = "" + hf_output_str + + return output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + decoder_prompt_type: DecoderPromptType, + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + ''' + Test the vLLM BART model for a variety of encoder/decoder input prompts, + by validating it against HuggingFace (HF) BART. + + Arguments: + + * hf_runner: HuggingFace (HF) test model runner + * vllm_runner: vLLM test model runner + * example_encoder_decoder_prompts: test fixture which provides a + dictionary of dummy prompts + * model: the HF ID of the specific BART variant under test + * dtype: the tensor datatype to employ + * max_tokens + * num_logprobs + * decoder_prompt_type: key into the example_encoder_decoder_prompts + dictionary; selects specific encoder/decoder + prompt scenarios to test + + A note on using HF BART as a baseline for validating vLLM BART, + specifically when the decoder prompt is None. + + The HF GenerationMixin's default behavior is to force the first + decoded token to be if the prompt does not already contain + (this is accomplished using a logit + processor setting.) + + So when we use HF BART as our baseline for comparison, note that + when the user provides a request with a None decoder prompt + (i.e. a singleton encoder prompt, or else an explicit encoder/ + decoder prompt with the decoder sub-prompt set to None), HF and + vLLM handle this in different ways: + + * HF will (1) tokenize the None prompt as an empty token-list, + (2) append to the beginning, yielding + [], (3) pass this token list to the model, and + then (4) after computing logits during prefill, override the model + logits & force to be the first generated token. + + * vLLM will (1) tokenize the None prompt as [], (2) append decoder- + start-token to the beginning, yielding [], + (3) pass these tokens to the model & proceed with generation. + + The net effect is that compared to vLLM, the list of HF *decoded* tokens + will contain one more initial than the vLLM generated tokens, + because vLLM's token is injected into the prompt rather than into + the generated output. 
This is in spite of the fact that overall, the + complete sequences (prompt + decoded tokens) produced by vLLM will match + HF. + + So when we use HF decoded token output to validate vLLM's decoded token + output, the testing process must account for the difference in decoded + token sequences between vLLM and HF specifically in the + decoder-prompt-is-None case. + + One option is to disable the logit processor feature that forces the + token to be decoded (forced_bos_token_id = None), eliminating + the problem entirely. However this is not "normal" BART usage. + + The other option is - only in the decoder-prompt-is-None case - to + discard the first decoded token from the HF output before comparing it + to vLLM. + + To that end, when testing the scenario where the decoder prompt is None + (and only in that one scenario), this test skips the first HF decoded + token during the process of validating the vLLM decoded output. + ''' + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default). + + # Note: currently encoder/decoder models are only compatible with + # enforce_eager=True. Normally this is not a problem because + # for encoder/decoder models vLLM will + # default to enforce_eager=True if enforce_eager + # is left unspecified. However, the + # VllmRunner test fixture (which wraps around the LLM class) defaults to + # enforce_eager=False (a behavior which a number of already-exisitng + # decoder-only unit tests expect), so when testing an encoder/decoder + # model we must explicitly specify enforce_eager=True in the VllmRunner + # constructor. 
+ with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + prompts, max_tokens, num_logprobs) + + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + + hf_skip_tokens = (1 + if decoder_prompt_type == DecoderPromptType.NONE else 0) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) +def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, + dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: + + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) +@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) +def test_models_distributed(hf_runner, vllm_runner, + example_encoder_decoder_prompts, + distributed_executor_backend, model, dtype, + max_tokens, num_logprobs, + decoder_prompt_type) -> None: + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + ) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 2a215331704c1..ef8d576616838 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -75,6 +75,22 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): slot_mapping: torch.Tensor seq_lens: Optional[List[int]] + # Begin encoder attn & enc/dec cross-attn fields... 
+ # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + def __post_init__(self): # Set during the execution of the first attention op. # It is a list because it is needed to set per prompt @@ -82,6 +98,28 @@ def __post_init__(self): # from xformer API. # will not appear in the __repr__ and __init__ self.attn_bias: Optional[List[torch.Tensor]] = None + self.encoder_attn_bias: Optional[List[torch.Tensor]] = None + self.cross_attn_bias: Optional[List[torch.Tensor]] = None + + @property + def is_all_encoder_attn_metadata_set(self): + ''' + All attention metadata required for encoder attention is set. + ''' + return ((self.encoder_seq_lens is not None) + and (self.encoder_seq_lens_tensor is not None) + and (self.max_encoder_seq_len is not None)) + + @property + def is_all_cross_attn_metadata_set(self): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. + ''' + return (self.is_all_encoder_attn_metadata_set + and (self.cross_slot_mapping is not None) + and (self.cross_block_tables is not None)) @property def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: @@ -101,6 +139,136 @@ def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: return self + def get_seq_lens( + self, + attn_type: AttentionType, + ): + ''' + Extract appropriate sequence lengths from attention metadata + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate sequence lengths tensor for query + * Appropriate sequence lengths tensor for key & value + ''' + + if attn_type == AttentionType.DECODER: + seq_lens_q = self.seq_lens + seq_lens_kv = self.seq_lens + elif attn_type == AttentionType.ENCODER: + seq_lens_q = self.encoder_seq_lens + seq_lens_kv = self.encoder_seq_lens + elif attn_type == AttentionType.ENCODER_DECODER: + seq_lens_q = self.seq_lens + seq_lens_kv = self.encoder_seq_lens + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + return seq_lens_q, seq_lens_kv + + def get_attn_bias( + self, + attn_type: AttentionType, + ) -> Optional[List[torch.Tensor]]: + ''' + Extract appropriate attention bias from attention metadata + according to attention type. 
+ + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + * Appropriate attention bias value given the attention type + ''' + + if attn_type == AttentionType.DECODER: + return self.attn_bias + elif attn_type == AttentionType.ENCODER: + return self.encoder_attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + return self.cross_attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def set_attn_bias( + self, + attn_bias: List[torch.Tensor], + attn_type: AttentionType, + ) -> None: + ''' + Update appropriate attention bias field of attention metadata, + according to attention type. + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * attn_bias: The desired attention bias value + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + ''' + + if attn_type == AttentionType.DECODER: + self.attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER: + self.encoder_attn_bias = attn_bias + elif attn_type == AttentionType.ENCODER_DECODER: + self.cross_attn_bias = attn_bias + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + def get_seq_len_block_table_args( + self, + attn_type: AttentionType, + ) -> tuple: + ''' + The particular choice of sequence-length- and block-table-related + attributes which should be extracted from attn_metadata is dependent + on the type of attention operation. + + Decoder attn -> select entirely decoder self-attention-related fields + Encoder/decoder cross-attn -> select encoder sequence lengths & + cross-attn block-tables fields + Encoder attn -> select encoder sequence lengths fields & no block tables + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention + * is_prompt: True if prefill, False otherwise + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + + * Appropriate sequence-lengths tensor + * Appropriate max sequence-length scalar + * Appropriate block tables (or None) + ''' + + if attn_type == AttentionType.DECODER: + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + return (self.seq_lens_tensor, self.max_decode_seq_len, + self.block_tables) + elif attn_type == AttentionType.ENCODER_DECODER: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + self.cross_block_tables) + elif attn_type == AttentionType.ENCODER: + # No block tables associated with encoder attention + return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len, + None) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): @@ -171,84 +339,101 @@ def forward( shape = [num_tokens, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "TorchSDPABackendImpl") - num_tokens, hidden_size = query.shape + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") + 
elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") + # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - if kv_cache.numel() > 0: + if key is not None: + assert value is not None + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + else: + assert value is None + + if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): + # KV-cache during decoder-self- or + # encoder-decoder-cross-attention, but not + # during encoder attention. + # + # Even if there are no new key/value pairs to cache, + # we still need to break out key_cache and value_cache + # i.e. for later use by paged attention key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, k_scale, - v_scale) - if attn_metadata.is_prompt: + if (key is not None) and (value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + # During cross-attention decode, key & value will be None, + # preventing this IF-statement branch from running + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + PagedAttention.write_to_paged_cache(key, value, key_cache, + value_cache, + updated_slot_mapping, + self.kv_cache_dtype, + k_scale, v_scale) + + if attn_type != AttentionType.ENCODER: + # Decoder self-attention supports chunked prefill. 
+ # Encoder/decoder cross-attention requires no chunked + # prefill (100% prefill or 100% decode tokens, no mix) + num_prefill_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + else: + # Encoder attention - chunked prefill is not applicable; + # derive token-count from query shape & and treat them + # as 100% prefill tokens + assert attn_metadata.num_encoder_tokens is not None + num_prefill_tokens = attn_metadata.num_encoder_tokens + num_decode_tokens = 0 + + if attn_type == AttentionType.DECODER: + # Only enforce this shape-constraint for decoder + # self-attention + assert key.shape[0] == num_prefill_tokens + num_decode_tokens + assert value.shape[0] == num_prefill_tokens + num_decode_tokens + + if prefill_meta := attn_metadata.prefill_metadata: assert attn_metadata.seq_lens is not None if (kv_cache.numel() == 0 - or attn_metadata.block_tables.numel() == 0): - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, - dim=1) - - if attn_metadata.attn_bias is None: - if self.alibi_slopes is not None: - att_masks = _make_alibi_bias( - self.alibi_slopes, query.dtype, - attn_metadata.seq_lens) # type: ignore - elif self.sliding_window is not None: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, self.sliding_window, - query.dtype) # type: ignore - else: - att_masks = [None] * len(attn_metadata.seq_lens) - attn_metadata.attn_bias = att_masks - - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - - start = 0 - output = torch.empty( - (num_tokens, self.num_heads, self.head_size), - dtype=query.dtype) - for seq_len, mask in zip(attn_metadata.seq_lens, - attn_metadata.attn_bias): - end = start + seq_len - sub_out = scaled_dot_product_attention( - query[None, :, start:end, :], - key[None, :, start:end, :], - value[None, :, start:end, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=not self.need_mask, - scale=self.scale).squeeze(0).movedim( - query.dim() - 2, 0) - output[start:end, :, :] = sub_out - start = end + or prefill_meta.block_tables.numel() == 0): + output = self._run_sdpa_forward(query, + key, + value, + prefill_meta, + attn_type=attn_type) else: # prefix-enabled attention raise RuntimeError( "Torch SDPA backend doesn't support prefix decoding.") - else: + if decode_meta := attn_metadata.decode_metadata: # Decoding run. + ( + seq_lens_arg, + max_seq_len_arg, + block_tables_arg, + ) = decode_meta.get_seq_len_block_table_args(attn_type) + output = PagedAttention.forward_decode( query, key_cache, value_cache, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - attn_metadata.max_decode_seq_len, + block_tables_arg, + seq_lens_arg, + max_seq_len_arg, self.kv_cache_dtype, self.num_kv_heads, self.scale, @@ -260,6 +445,59 @@ def forward( # Reshape the output tensor. 
return output.view(-1, self.num_heads * self.head_size) + def _run_sdpa_forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_metadata: TorchSDPAMetadata, + attn_type: AttentionType = AttentionType.DECODER, + ): + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=1) + value = value.repeat_interleave(self.num_queries_per_kv, dim=1) + + attn_masks = attn_metadata.get_attn_bias(attn_type) + if attn_masks is None: + if self.alibi_slopes is not None: + attn_masks = _make_alibi_bias( + self.alibi_slopes, query.dtype, + attn_metadata.seq_lens) # type: ignore + elif self.sliding_window is not None: + assert attn_metadata.seq_lens is not None + attn_masks = _make_sliding_window_bias( + attn_metadata.seq_lens, self.sliding_window, + query.dtype) # type: ignore + else: + seq_lens, _ = attn_metadata.get_seq_lens(attn_type) + attn_masks = [None] * len(seq_lens) + attn_metadata.set_attn_bias(attn_masks, attn_type) + + output = torch.empty_like(query) + query = query.movedim(0, query.dim() - 2) + key = key.movedim(0, key.dim() - 2) + value = value.movedim(0, value.dim() - 2) + + causal_attn = (attn_type == AttentionType.DECODER) + + seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) + start_q, start_kv = 0, 0 + for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv, + attn_masks): + end_q = start_q + seq_len_q + end_kv = start_kv + seq_len_kv + sub_out = scaled_dot_product_attention( + query[None, :, start_q:end_q, :], + key[None, :, start_kv:end_kv, :], + value[None, :, start_kv:end_kv, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=causal_attn and not self.need_mask, + scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) + output[start_q:end_q, :, :] = sub_out + start_q, start_kv = end_q, end_kv + return output + def _make_alibi_bias( alibi_slopes: torch.Tensor, diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py new file mode 100644 index 0000000000000..8ebbf6db939bc --- /dev/null +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -0,0 +1,311 @@ +import dataclasses +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast + +import torch + +from vllm.attention import AttentionMetadata +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.multimodal import MultiModalInputs +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata +from vllm.utils import make_tensor_with_pad +from vllm.worker.cpu_model_runner import (CPUModelRunner, + ModelInputForCPUBuilder, + ModelInputForCPUWithSamplingMetadata) +from vllm.worker.model_runner_base import ( + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + + +@dataclasses.dataclass(frozen=True) +class EncoderDecoderModelInputForCPU(ModelInputForCPUWithSamplingMetadata): + """ + Used by the EncoderDecoderModelRunner. 
+ """ + encoder_input_tokens: Optional[torch.Tensor] = None + encoder_input_positions: Optional[torch.Tensor] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "encoder_input_tokens": self.encoder_input_tokens, + "encoder_input_positions": self.encoder_input_positions, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "EncoderDecoderModelInputForCPU": + return cast( + EncoderDecoderModelInputForCPU, + super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) + + +class CPUEncoderDecoderModelRunner(CPUModelRunner): + _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( + EncoderDecoderModelInputForCPU) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + def _list_to_int32_tensor( + self, + _list: List[int], + ) -> torch.Tensor: + return torch.tensor(_list, dtype=torch.int32, device=self.device) + + def _list_to_long_tensor( + self, + _list: List[int], + ) -> torch.Tensor: + return torch.tensor(_list, dtype=torch.long, device=self.device) + + def _empty_int32_tensor(self) -> torch.Tensor: + return self._list_to_int32_tensor([]) + + def _empty_long_tensor(self) -> torch.Tensor: + return self._list_to_long_tensor([]) + + def make_model_input_from_broadcasted_tensor_dict( + self, tensor_dict: Dict[str, + Any]) -> EncoderDecoderModelInputForCPU: + return EncoderDecoderModelInputForCPU.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> EncoderDecoderModelInputForCPU: + model_input = super().prepare_model_input(seq_group_metadata_list, + virtual_engine, + finished_requests_ids) + model_input = cast(EncoderDecoderModelInputForCPU, model_input) + ( + attn_metadata, + encoder_input_tokens_tensor, + encoder_input_positions_tensor, + ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, + model_input) + return dataclasses.replace( + model_input, + attn_metadata=attn_metadata, + encoder_input_tokens=encoder_input_tokens_tensor, + encoder_input_positions=encoder_input_positions_tensor, + ) + + def _prepare_encoder_model_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: EncoderDecoderModelInputForCPU, + ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], + Optional[torch.Tensor]]: + """Helper method to prepare the encoder- and cross-attn-related + model inputs based on a given sequence group. These additional inputs + are used to augment an already-computed `EncoderDecoderModelInput` + data structure which already has decoder-related model inputs + populated. 
+ + Sets the following attn_metadata fields: + * `num_encoder_tokens` + * `encoder_seq_lens` + * `encoder_seq_lens_tensor` + * `max_encoder_seq_len` + * `cross_slot_mapping` + * `cross_block_tables` + + Constructs a new model inputs data structure, based on + (1) the existing fields in the `model_inputs` argument, + and (2) the following additional fields which are + computed (or in the case of `attn_metadata`, updated) + by this function: + * attn_metadata + * encoder_input_tokens + * encoder_input_positions + + Arguments: + + * seq_group_metadata_list: list of sequence groups for which to + compute inputs + * model_inputs: model inputs data structure with decoder-oriented + fields already computed. + + Return: + + * Updated model inputs data structure + """ + + if len(seq_group_metadata_list) == 0: + return (model_input.attn_metadata, None, None) + + # Since we are not supporting chunked prefill either the entire + # batch is prefill or it is decode + is_prompt = seq_group_metadata_list[0].is_prompt + + # Build encoder inputs + encoder_seq_lens: List[int] = [] + if is_prompt: + # Prefill phase. + cross_block_tables = self._empty_int32_tensor().view( + len(seq_group_metadata_list), -1) + + # Extract input tokens/positions, cross-attention slot-mapping, + # & seq len from each sequence group metadata + ( + encoder_input_tokens, + encoder_input_positions, + cross_slot_mapping, + ) = ( + [], + [], + [], + ) + for seq_group_metadata in seq_group_metadata_list: + # Build seq lens + seq_len = seq_group_metadata.encoder_seq_data.get_len() + token_ids = seq_group_metadata.encoder_seq_data.get_token_ids() + encoder_seq_lens.append(seq_len) + + # Build slot mapping + for i in range(0, seq_len): + block_number = seq_group_metadata.cross_block_table[ + i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + cross_slot_mapping.append(slot) + + # Build encoder input tokens + encoder_input_tokens.extend(token_ids) + encoder_input_positions.extend(list(range(0, seq_len))) + + # Convert tokens/positions & cross-attention + # slot-mapping to encoder input tensors + encoder_input_tokens_tensor = self._list_to_long_tensor( + encoder_input_tokens) + encoder_input_positions_tensor = self._list_to_long_tensor( + encoder_input_positions) + cross_slot_mapping_tensor = self._list_to_long_tensor( + cross_slot_mapping) + + else: + # Decode phase. + encoder_input_tokens_tensor = self._empty_long_tensor() + encoder_input_positions_tensor = self._empty_long_tensor() + cross_slot_mapping_tensor = self._empty_long_tensor() + # Extract cross-attention block tables & + # seq len from each sequence group metadata. + # Cross-attention block tables are empty + # during vLLM memory profiling. 
+ cross_block_tables = [] + for seq_group_metadata in seq_group_metadata_list: + for _ in range(len(seq_group_metadata.seq_data)): + encoder_seq_lens.append( + seq_group_metadata.encoder_seq_data.get_len()) + cross_block_table = seq_group_metadata.cross_block_table + cross_block_tables.append([] if ( + cross_block_table is None) else cross_block_table) + + max_len_of_block_table = max( + len(block_table) for block_table in cross_block_tables) + + cross_block_tables = make_tensor_with_pad( + cross_block_tables, + max_len=max_len_of_block_table, + pad=0, + dtype=torch.int32, + device=self.device, + ) + + # Compute encoder sequence lengths & encoder + # sequence starting offset tensors + max_encoder_seq_len = max(encoder_seq_lens, default=0) + encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) + encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + + 1, + dtype=torch.int32, + device=self.device) + torch.cumsum(encoder_seq_lens_tensor, + dim=0, + dtype=encoder_seq_start_loc.dtype, + out=encoder_seq_start_loc[1:]) + + # Update attention metadata with encoder-oriented attributes + attn_metadata = model_input.attn_metadata + assert attn_metadata is not None + ( + attn_metadata.num_encoder_tokens, + attn_metadata.encoder_seq_lens, + attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, + attn_metadata.cross_slot_mapping, + attn_metadata.cross_block_tables, + ) = ( + sum(encoder_seq_lens), + encoder_seq_lens, + encoder_seq_lens_tensor, + max_encoder_seq_len, + cross_slot_mapping_tensor, + cross_block_tables, + ) + + return (attn_metadata, encoder_input_tokens_tensor, + encoder_input_positions_tensor) + + @torch.no_grad() + def execute_model( + self, + model_input: EncoderDecoderModelInputForCPU, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[List[SamplerOutput]]: + if num_steps > 1: + raise ValueError( + "CPU worker does not support multi-step execution.") + + model_executable = self.model + execute_model_kwargs = { + "input_ids": + model_input.input_tokens, + "positions": + model_input.input_positions, + "encoder_input_ids": + model_input.encoder_input_tokens, + "encoder_positions": + model_input.encoder_input_positions, + "kv_caches": + kv_caches, + "attn_metadata": + model_input.attn_metadata, + **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + device=self.device), + "intermediate_tensors": + intermediate_tensors, + } + + hidden_states = model_executable(**execute_model_kwargs) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) + + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return [] + + # Sample the next token. 
+ output = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + return [output] diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 534d167d994fe..a03c562532179 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -19,7 +19,7 @@ MultiModalInputs) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS, make_tensor_with_pad +from vllm.utils import make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -434,10 +434,6 @@ def __init__( # Lazy initialization. self.model: nn.Module # Set after init_Model - if self.model_config.is_encoder_decoder_model: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CPU']) - @property def model_is_mrope(self) -> bool: """Detect if the model has "mrope" rope_scaling type. @@ -459,8 +455,8 @@ def load_model(self) -> None: def make_model_input_from_broadcasted_tensor_dict( self, tensor_dict: Dict[str, Any], - ) -> ModelInputForCPU: - return ModelInputForCPU.from_broadcasted_tensor_dict( + ) -> ModelInputForCPUWithSamplingMetadata: + return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 tensor_dict, attn_backend=self.attn_backend, ) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 5e36fba6ccdea..7384ffcb2c5e5 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,5 +1,5 @@ """A CPU worker class.""" -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Type import torch import torch.distributed @@ -15,6 +15,7 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner from vllm.worker.cpu_model_runner import CPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerInput) @@ -163,7 +164,10 @@ def __init__( else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] - self.model_runner: CPUModelRunner = CPUModelRunner( + ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner + if self._is_encoder_decoder_model(): + ModelRunnerClass = CPUEncoderDecoderModelRunner + self.model_runner: CPUModelRunner = ModelRunnerClass( model_config, parallel_config, scheduler_config, @@ -205,6 +209,9 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() + def _is_encoder_decoder_model(self): + return self.model_config.is_encoder_decoder_model + def init_device(self) -> None: if self.local_omp_cpuid != "all": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) From f19da64871065510691cd4fcaa5f4096b661dcec Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Oct 2024 18:01:46 +0800 Subject: [PATCH 06/12] [Core] Refactor GGUF parameters packing and forwarding (#8859) --- .../models/decoder_only/language/test_gguf.py | 12 +-- vllm/model_executor/layers/linear.py | 76 ++++++++----------- .../layers/quantization/gguf.py | 36 ++++++--- vllm/model_executor/models/llama.py | 2 +- 4 files changed, 64 insertions(+), 62 deletions(-) diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 
8fc64a10c84af..5dc83942632fd 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -19,12 +19,12 @@ # FIXME: Move this to confest MODELS = [ - ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", - hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", - filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")), - ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", - hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF", - filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")), + ("meta-llama/Llama-3.2-1B-Instruct", + hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", + filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")), + ("meta-llama/Llama-3.2-1B-Instruct", + hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", + filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")), ("Qwen/Qwen2-1.5B-Instruct", hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", filename="qwen2-1_5b-instruct-q4_k_m.gguf")), diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 568892778abe2..c162ab81c5530 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -440,17 +440,23 @@ def weight_loader(self, param.shard_weight_type[loaded_shard_id] = loaded_weight.item() return - if is_gguf_weight and isinstance(param, UninitializedParameter): - from gguf.constants import GGML_QUANT_SIZES + if is_gguf_weight: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // tp_size + start_idx = tp_rank * shard_size + + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) - ori_shape = param.tensor_shape - weight_types = self.qweight_type.shard_weight_type.values() - row_size = [] - for weight_type in weight_types: - block_size, type_size = GGML_QUANT_SIZES[weight_type] - row_size.append(ori_shape[1] // block_size * type_size) - q_shape = (ori_shape[0], max(row_size)) - param.materialize(q_shape, dtype=loaded_weight.dtype) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + if len(param.data_container) == 2: + self.qweight = param.materialize_nested() + return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -515,18 +521,6 @@ def weight_loader(self, shard_offset = loaded_weight.shape[output_dim] * \ loaded_shard_id - if is_gguf_weight: - tp_size = get_tensor_model_parallel_world_size() - output_dim = getattr(param, "output_dim", None) - shard_shape = list(loaded_weight.shape) - shard_shape[output_dim] = shard_shape[output_dim] // tp_size - param.shard_id.append(loaded_shard_id) - param.shard_size[loaded_shard_id] = shard_shape - - input_dim = getattr(param, "input_dim", None) - input_size = loaded_weight.shape[input_dim] - param_data = param_data.narrow(input_dim, 0, input_size) - param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size @@ -783,17 +777,23 @@ def weight_loader(self, param.shard_weight_type[loaded_shard_id] = loaded_weight.item() return - if is_gguf_weight and isinstance(param, UninitializedParameter): - from gguf.constants import GGML_QUANT_SIZES + if is_gguf_weight: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() - ori_shape = param.tensor_shape - weight_types = self.qweight_type.shard_weight_type.values() - row_size = [] - for 
weight_type in weight_types: - block_size, type_size = GGML_QUANT_SIZES[weight_type] - row_size.append(ori_shape[1] // block_size * type_size) - q_shape = (ori_shape[0], max(row_size)) - param.materialize(q_shape, dtype=loaded_weight.dtype) + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // tp_size + start_idx = tp_rank * shard_size + + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + if len(param.data_container) == 3: + self.qweight = param.materialize_nested() + return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -883,18 +883,6 @@ def weight_loader(self, shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( param, orig_qkv_offsets, loaded_shard_id) - if is_gguf_weight: - tp_size = get_tensor_model_parallel_world_size() - output_dim = getattr(param, "output_dim", None) - shard_shape = list(loaded_weight.shape) - shard_shape[output_dim] = shard_shape[output_dim] // tp_size - param.shard_id.append(loaded_shard_id) - param.shard_size[loaded_shard_id] = shard_shape - - input_dim = getattr(param, "input_dim", None) - input_size = loaded_weight.shape[input_dim] - param_data = param_data.narrow(input_dim, 0, input_size) - param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index dc83017bcc7f9..d73b9f6d92832 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -86,15 +86,16 @@ def create_weights(self, layer: torch.nn.Module, output_size_per_partition = sum(output_partition_sizes) tensor_shape = (output_size_per_partition, input_size_per_partition) - qweight = UninitializedParameter(requires_grad=False) + qweight = GGUFUninitializedParameter(requires_grad=False) set_weight_attrs( qweight, { "input_dim": 1, "output_dim": 0, "tensor_shape": tensor_shape, "is_gguf_weight": True, - "shard_size": {}, + "data_container": [], "shard_id": [], + "shard_id_map": {}, }) set_weight_attrs(qweight, extra_weight_attrs) layer.register_parameter("qweight", qweight) @@ -116,21 +117,17 @@ def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - shard_size = getattr(layer.qweight, "shard_size", None) shard_id = getattr(layer.qweight, "shard_id", None) - if shard_id and shard_size: - result = [] - offset = 0 + if shard_id: # dequantize shard weights respectively shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id + qweight = layer.qweight.unbind(0) + result = [] for id in shard_id: - shard_weight = layer.qweight[ - offset:offset + - shard_size[id][0], :shard_size[id][1]].contiguous() + q_idx = layer.qweight.shard_id_map[id] qweight_type = layer.qweight_type.shard_weight_type[id] - result.append(_fuse_mul_mat(x, shard_weight, qweight_type)) - offset += shard_size[id][0] + result.append(_fuse_mul_mat(x, qweight[q_idx], qweight_type)) out = torch.cat(result, axis=1) else: qweight = layer.qweight @@ -162,3 +159,20 @@ def embedding(self, layer: torch.nn.Module, dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size, x_flat.shape[0]) return dequant.view(*x.shape, hidden_size) + + +class GGUFUninitializedParameter(UninitializedParameter): + cls_to_become = Parameter + 
data_container: List[torch.Tensor] + + def materialize_nested(self) -> Parameter: + nested_data = torch.nested.nested_tensor(self.data_container, + device=self.device, + dtype=torch.uint8) + self.data_container.clear() + param = torch.Tensor._make_subclass(self.cls_to_become, + nested_data, + require_grad=False) + for k, v in self.__dict__.items(): + setattr(param, k, v) + return param diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d591d20f7f2f2..8eacf73dd6322 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -512,7 +512,7 @@ def __init__( quant_config=quant_config, ) if config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight + self.lm_head = self.model.embed_tokens logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, From 151ef4efd2fb52554f4d30408aca619e181ea751 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 7 Oct 2024 19:55:12 +0800 Subject: [PATCH 07/12] [Model] Support NVLM-D and fix QK Norm in InternViT (#9045) Co-authored-by: Roger Wang Co-authored-by: Isotr0py --- docs/source/models/supported_models.rst | 9 + examples/offline_inference_vision_language.py | 55 +++- ...e_inference_vision_language_multi_image.py | 34 ++ vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/layers/layernorm.py | 32 +- vllm/model_executor/models/intern_vit.py | 206 +++++++----- vllm/model_executor/models/internvl.py | 294 +++++++++++------- vllm/model_executor/models/nvlm_d.py | 64 ++++ vllm/model_executor/models/registry.py | 37 +-- vllm/transformers_utils/config.py | 7 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/nvlm_d.py | 12 + 12 files changed, 518 insertions(+), 236 deletions(-) create mode 100644 vllm/model_executor/models/nvlm_d.py create mode 100644 vllm/transformers_utils/configs/nvlm_d.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index dea109cb17f58..084607c155cb0 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -315,6 +315,9 @@ Multimodal Language Models .. _supported_vlms: +Text Generation +--------------- + .. list-table:: :widths: 25 25 25 25 5 5 :header-rows: 1 @@ -384,7 +387,13 @@ Multimodal Language Models - Image - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - + - + * - :code:`NVLM_D_Model` + - NVLM-D 1.0 + - Image\ :sup:`E+` + - :code:`nvidia/NVLM-D-72B`, etc. 
- + - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image\ :sup:`E` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index b94ef537d783f..efad7e33793df 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -18,7 +18,7 @@ # LLaVA-1.5 -def run_llava(question, modality): +def run_llava(question: str, modality: str): assert modality == "image" prompt = f"USER: \n{question}\nASSISTANT:" @@ -29,7 +29,7 @@ def run_llava(question, modality): # LLaVA-1.6/LLaVA-NeXT -def run_llava_next(question, modality): +def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" @@ -40,7 +40,7 @@ def run_llava_next(question, modality): # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(question, modality): +def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: