flashinfer-ai · yzh119 · Aug 13, 2024 · Aug 13, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/python/csrc/batch_prefill.cu b/python/csrc/batch_prefill.cu
@@ -289,11 +289,11 @@ std::vector<torch::Tensor> BatchPrefillWithPagedKVCachePyTorchWrapper::ForwardCu
 
   if (paged_kv_defined) {
     // [max_num_pages, 2, num_kv_heads, page_size, head_dim] for HND
-    // [max_num_pages, 2, page_size, num_kv_heads, head_dim] for HND
+    // [max_num_pages, 2, page_size, num_kv_heads, head_dim] for NHD
     CHECK_DIM(5, paged_kv_cache.value());
   } else {
     // [max_num_pages, num_kv_heads, page_size, head_dim] for HND
-    // [max_num_pages, page_size, num_kv_heads, head_dim] for HND
+    // [max_num_pages, page_size, num_kv_heads, head_dim] for NHD
     CHECK_DIM(4, paged_k_cache.value());
     CHECK_DIM(4, paged_v_cache.value());
   }

diff --git a/python/flashinfer/cascade.py b/python/flashinfer/cascade.py
@@ -374,7 +374,7 @@ def forward(
               ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
               :attr:`kv_layout` is ``NHD``, and
               ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-              :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+              :attr:`kv_layout` is ``HND``, where ``paged_kv_cache[:, 0]`` is the key-cache and
               ``paged_kv_cache[:, 1]`` is the value-cache.
 
         allow_fp16_qk_reduction : bool
@@ -631,7 +631,7 @@ def forward(
               ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
               :attr:`kv_layout` is ``NHD``, and
               ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-              :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+              :attr:`kv_layout` is ``HND``, where ``paged_kv_cache[:, 0]`` is the key-cache and
               ``paged_kv_cache[:, 1]`` is the value-cache.
 
         causal : bool

diff --git a/python/flashinfer/decode.py b/python/flashinfer/decode.py
@@ -577,7 +577,7 @@ def forward(
               ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
               :attr:`kv_layout` is ``NHD``, and
               ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-              :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+              :attr:`kv_layout` is ``HND``, where ``paged_kv_cache[:, 0]`` is the key-cache and
               ``paged_kv_cache[:, 1]`` is the value-cache.
 
         pos_encoding_mode : str
@@ -696,7 +696,7 @@ def forward_return_lse(
               ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
               :attr:`kv_layout` is ``NHD``, and
               ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-              :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+              :attr:`kv_layout` is ``HND``, where ``paged_kv_cache[:, 0]`` is the key-cache and
               ``paged_kv_cache[:, 1]`` is the value-cache.
 
         pos_encoding_mode : str

diff --git a/python/flashinfer/page.py b/python/flashinfer/page.py
@@ -65,7 +65,7 @@ def append_paged_kv_cache(
           ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
           :attr:`kv_layout` is ``NHD``, and
           ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-          :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+          :attr:`kv_layout` is ``HND``, where ``paged_kv_cache[:, 0]`` is the key-cache and
           ``paged_kv_cache[:, 1]`` is the value-cache.
 
     kv_indices : torch.Tensor

diff --git a/python/flashinfer/prefill.py b/python/flashinfer/prefill.py
@@ -778,8 +778,8 @@ def begin_forward(
             self._paged_kv_indices_buf = paged_kv_indices
             self._paged_kv_last_page_len_buf = paged_kv_last_page_len
             if packed_custom_mask is not None:
-                self._custom_mask = packed_custom_mask
-                self._qk_indptr = qk_indptr
+                self._custom_mask_buf = packed_custom_mask
+                self._qk_indptr_buf = qk_indptr
         empty_q_data = torch.empty(
             0,
             dtype=(
@@ -843,7 +843,7 @@ def forward(
               ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
               :attr:`kv_layout` is ``NHD``, and
               ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-              :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+              :attr:`kv_layout` is ``HND``, where ``paged_kv_cache[:, 0]`` is the key-cache and
               ``paged_kv_cache[:, 1]`` is the value-cache.
 
         causal : bool
@@ -969,7 +969,7 @@ def forward_return_lse(
               ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
               :attr:`kv_layout` is ``NHD``, and
               ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
-              :attr:`kv_layout` is ``NHD``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
+              :attr:`kv_layout` is ``HND``, where ``paged_kv_cache[:, 0]`` is the key-cache and
               ``paged_kv_cache[:, 1]`` is the value-cache.
 
         causal : bool