[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (vllm-project#6501)
noamgat authored Jul 18, 2024
1 parent 28ab106 commit 27f6444
Showing 1 changed file with 3 additions and 2 deletions.
vllm/attention/backends/flashinfer.py (5 changes: 3 additions & 2 deletions)
@@ -20,6 +20,7 @@
 from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
                                            compute_slot_mapping_start_idx,
                                            is_block_tables_empty)
+from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.sequence import SequenceGroupMetadata
 from vllm.utils import get_kv_cache_torch_dtype, make_tensor_with_pad

@@ -61,14 +62,14 @@ def swap_blocks(
         dst_kv_cache: torch.Tensor,
         src_to_dst: torch.Tensor,
     ) -> None:
-        raise NotImplementedError
+        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
 
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
         src_to_dists: torch.Tensor,
     ) -> None:
-        raise NotImplementedError
+        PagedAttention.copy_blocks(kv_caches, src_to_dists)
 
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
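
For context, swap_blocks and copy_blocks are the cache-management hooks the engine calls to move KV-cache blocks from one cache to another and to duplicate blocks within the existing caches; before this commit the FlashInfer backend raised NotImplementedError for both, and after it the calls are forwarded to the existing PagedAttention implementations. The snippet below is a minimal, self-contained sketch of the block-level semantics only: the tensor layout, the toy function names (swap_blocks_toy, copy_blocks_toy), and the [num_pairs, 2] mapping format are illustrative assumptions, not vLLM's actual cache format or the real PagedAttention kernels.

# Illustrative only: a toy paged KV cache with a hypothetical layout.
# vLLM's real caches and the PagedAttention.swap_blocks / copy_blocks
# kernels use their own layouts; this just shows the block-level semantics.
import torch

num_blocks, block_size, num_heads, head_size = 8, 16, 4, 64

# One "KV cache" per device: index 0 holds keys, index 1 holds values.
cpu_kv_cache = torch.randn(2, num_blocks, block_size, num_heads, head_size)
gpu_kv_cache = torch.zeros_like(cpu_kv_cache)


def swap_blocks_toy(src_kv_cache, dst_kv_cache, src_to_dst):
    """Copy whole blocks from the source cache into the destination cache.

    src_to_dst is a [num_pairs, 2] tensor of (src_block, dst_block) indices.
    """
    for src_block, dst_block in src_to_dst.tolist():
        dst_kv_cache[:, dst_block].copy_(src_kv_cache[:, src_block])


def copy_blocks_toy(kv_caches, src_to_dists):
    """Duplicate blocks within each cache (e.g. when a sequence forks)."""
    for src_block, dst_block in src_to_dists.tolist():
        for kv_cache in kv_caches:
            kv_cache[:, dst_block].copy_(kv_cache[:, src_block])


# Move blocks 0 and 3 of the CPU cache into slots 2 and 5 of the GPU cache.
swap_blocks_toy(cpu_kv_cache, gpu_kv_cache, torch.tensor([[0, 2], [3, 5]]))
# Duplicate block 2 into block 6 in every cache in the list.
copy_blocks_toy([gpu_kv_cache], torch.tensor([[2, 6]]))

With the change above, FlashInferBackend delegates both operations to PagedAttention instead of raising, which the commit title ties to the Gemma2 OpenAI server crash.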
