diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 24a7e704bb35d..34f4368e09229 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -85,7 +85,8 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]: # TODO(rickyx): potentially we could cache this so we don't have to # recompute it every time. - metadata_hash = None if not request.lora_request else hash(request.lora_request.lora_int_id) + metadata_hash = (None if not request.lora_request else + request.lora_request.lora_int_id) block_hashes = hash_request_tokens(self.block_size, request.all_token_ids, parent_hash=metadata_hash) @@ -379,8 +380,9 @@ def _cache_full_blocks( prev_block: The previous block in the chain. """ # Update the new blocks with the block hashes through the chain. - metadata_hash = None if request.lora_request is None else request.lora_request.lora_int_id - parent_hash = metadata_hash + metadata_hash = (None if request.lora_request is None else + request.lora_request.lora_int_id) + parent_hash = metadata_hash if prev_block is not None: # Previous block must have a block hash because it must be # a full, cached block. diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9d5ebd44e57fb..f8d19fbd0650d 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -182,9 +182,10 @@ def hash_block_tokens(parent_hash: Optional[int], tuple(curr_block_token_ids)) -def hash_request_tokens(block_size: int, - token_ids: Sequence[int], - parent_hash: Optional[int] = None) -> List[BlockHashType]: +def hash_request_tokens( + block_size: int, + token_ids: Sequence[int], + parent_hash: Optional[int] = None) -> List[BlockHashType]: """Computes hash values of a chain of blocks given a sequence of token IDs. The hash value is used for prefix caching. @@ -204,8 +205,7 @@ def hash_request_tokens(block_size: int, # Do not hash the block if it is not full. if len(block_token_ids) < block_size: break - block_hash = hash_block_tokens(parent_hash, - block_token_ids) + block_hash = hash_block_tokens(parent_hash, block_token_ids) ret.append(block_hash) parent_hash = block_hash.hash_value return ret