Skip to content

Commit

Permalink
Fix prefix-caching correctness: only reuse a cached prefix once it has actually been computed (new `Prefix.computed` flag set after model outputs are processed), instead of as soon as its blocks are allocated
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuohan123 committed Jan 17, 2024
1 parent 29f4f96 commit bd56a69
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 1 deletion.
6 changes: 6 additions & 0 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,12 @@ def _process_model_outputs(
request_output = RequestOutput.from_seq_group(seq_group)
request_outputs.append(request_output)

# Update prefix state, now all the uncomputed prefixes are computed.
for seq_group in scheduled_seq_groups:
if (seq_group.prefix is not None and seq_group.prefix.allocated
and not seq_group.prefix.computed):
seq_group.prefix.computed = True

if self.log_stats:
# Log the system stats.
self._log_system_stats(scheduler_outputs.prompt_run,
Expand Down
1 change: 1 addition & 0 deletions vllm/prefix.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(
self.hash = hash(token_ids)
assert self.length % block_size == 0
self.block_table: Optional[BlockTable] = None
self.computed = False

@property
def allocated(self) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def _prepare_prompt(
prompt_lens.append(prompt_len)
prefix_len = 0
prefix = seq_group_metadata.prefix
if prefix is not None and prefix.allocated:
if prefix is not None and prefix.computed:
prefix_len = prefix.get_length()
prompt_tokens = prompt_tokens[prefix_len:]
prefix_block_tables.append(prefix.get_block_numbers())
Expand Down

0 comments on commit bd56a69

Please sign in to comment.