add notes and small fix
zhuohan123 committed Jan 18, 2024
1 parent bd56a69 commit 6b00283
Showing 2 changed files with 8 additions and 2 deletions.
vllm/core/block_manager.py (4 changes: 2 additions & 2 deletions)
@@ -235,7 +235,7 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
         # CPU block -> GPU block.
         if seq_group.prefix is not None:
             # make sure to swap in the prefix first
-            assert seq_group.prefix.allocated
+            assert seq_group.prefix.allocated and seq_group.prefix.computed
 
         mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
         for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
@@ -278,7 +278,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
             for gpu_block in block_table:
                 if (seq_group.prefix is not None
                         and gpu_block in seq_group.prefix.block_table):
-                    # We do not swap out the prefix blocks.
+                    # NOTE: We do not swap out the prefix blocks for now.
                     self.gpu_allocator.free(gpu_block)
                     continue
 
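Taken together, the two hunks above tighten how shared-prefix blocks interact with swapping: a sequence group is only swapped back in once its prefix has blocks allocated and its KV cache computed, and on swap-out the prefix blocks are merely dereferenced rather than added to the GPU-to-CPU mapping. Below is a minimal, self-contained sketch of that bookkeeping; the Block, Prefix, check_swap_in, and build_swap_out_mapping names are illustrative stand-ins, not vLLM's actual classes or allocator.

from dataclasses import dataclass
from typing import Dict, List


@dataclass(eq=False)
class Block:
    """Stand-in for a physical KV-cache block with a reference count."""
    block_id: int
    ref_count: int = 1


@dataclass(eq=False)
class Prefix:
    """Stand-in for a shared prompt prefix; `computed` means its KV cache is filled."""
    block_table: List[Block]
    allocated: bool = False
    computed: bool = False


def check_swap_in(prefix: Prefix) -> None:
    # Mirrors the tightened assertion: the prefix must both hold blocks and
    # have its KV cache computed before dependent sequences are swapped in.
    assert prefix.allocated and prefix.computed


def build_swap_out_mapping(block_table: List[Block],
                           prefix: Prefix) -> Dict[int, int]:
    """Map GPU block ids to CPU block ids, skipping blocks owned by the prefix."""
    mapping: Dict[int, int] = {}
    next_cpu_block = 0
    for gpu_block in block_table:
        if gpu_block in prefix.block_table:
            # Prefix blocks are not swapped out for now: drop this sequence
            # group's reference and leave the prefix copy resident on the GPU.
            gpu_block.ref_count -= 1
            continue
        mapping[gpu_block.block_id] = next_cpu_block
        next_cpu_block += 1
    return mapping

Only non-prefix blocks end up in the returned mapping, which matches the freed-and-skipped path in the swap_out hunk above.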
vllm/prefix.py (6 changes: 6 additions & 0 deletions)
@@ -7,6 +7,9 @@ class Prefix:
     """Data and states associated with a prefix of prompt tokens for multiple
     sequence groups.
 
+    NOTE: This feature is experimental and may be replaced with automatic
+    prefix caching in the future.
+
     Args:
         prefix_id: The id of the prefix in the prefix pool.
         token_ids: The token ids of the prefix.
@@ -49,6 +52,9 @@ def set_block_table(self, block_table: BlockTable) -> None:
 class PrefixPool:
     """Manages all the prompt prefixes.
 
+    NOTE: This feature is experimental and may be replaced with automatic
+    prefix caching in the future.
+
     Args:
         block_size: The block size of the executed model.
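The docstring notes above flag the prefix feature as experimental. For orientation, here is a rough sketch of the pool idea those docstrings describe: a prefix is truncated to a whole number of blocks and keyed by a hash of its token ids, so identical prompt prefixes resolve to one shared Prefix object. The class shapes and the add_or_get method are hypothetical stand-ins, not the module's actual interface.

from typing import Dict, List, Optional, Tuple


class Prefix:
    """Toy stand-in: a fixed run of prompt tokens shared by sequence groups."""

    def __init__(self, prefix_id: int, token_ids: Tuple[int, ...],
                 block_size: int) -> None:
        self.prefix_id = prefix_id
        self.token_ids = token_ids
        self.block_size = block_size
        self.computed = False  # flipped once the KV cache for these tokens exists


class PrefixPool:
    """Toy stand-in for a pool that deduplicates prefixes across requests."""

    def __init__(self, block_size: int) -> None:
        self.block_size = block_size
        self._prefixes: Dict[int, Prefix] = {}

    def add_or_get(self, token_ids: List[int]) -> Optional[Prefix]:
        # Only whole blocks can be shared, so truncate to a block boundary.
        length = len(token_ids) // self.block_size * self.block_size
        if length == 0:
            return None
        key = hash(tuple(token_ids[:length]))
        if key not in self._prefixes:
            self._prefixes[key] = Prefix(len(self._prefixes),
                                         tuple(token_ids[:length]),
                                         self.block_size)
        return self._prefixes[key]

Under this sketch, two requests sharing the same leading blocks reuse one Prefix, which is what lets the block manager above treat prefix blocks specially during swapping.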
