From bd56a6938a9f6b873b7de9c5ae683ae49d0f5287 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Wed, 17 Jan 2024 23:44:42 +0000 Subject: [PATCH] fix correctness --- vllm/engine/llm_engine.py | 6 ++++++ vllm/prefix.py | 1 + vllm/worker/model_runner.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fd5a231a902ef..7072a8bbc5b3e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -672,6 +672,12 @@ def _process_model_outputs( request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) + # Update prefix state, now all the uncomputed prefixes are computed. + for seq_group in scheduled_seq_groups: + if (seq_group.prefix is not None and seq_group.prefix.allocated + and not seq_group.prefix.computed): + seq_group.prefix.computed = True + if self.log_stats: # Log the system stats. self._log_system_stats(scheduler_outputs.prompt_run, diff --git a/vllm/prefix.py b/vllm/prefix.py index 14266288f9092..985f8aa95a69f 100644 --- a/vllm/prefix.py +++ b/vllm/prefix.py @@ -24,6 +24,7 @@ def __init__( self.hash = hash(token_ids) assert self.length % block_size == 0 self.block_table: Optional[BlockTable] = None + self.computed = False @property def allocated(self) -> bool: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0f0ab5244b8ae..d290886506507 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -97,7 +97,7 @@ def _prepare_prompt( prompt_lens.append(prompt_len) prefix_len = 0 prefix = seq_group_metadata.prefix - if prefix is not None and prefix.allocated: + if prefix is not None and prefix.computed: prefix_len = prefix.get_length() prompt_tokens = prompt_tokens[prefix_len:] prefix_block_tables.append(prefix.get_block_numbers())