[Perf] Mem align KV caches for CUDA devices (MLA perf improvement) (v…

…llm-project#12676) Signed-off-by: simon-mo <xmo@berkeley.edu> Signed-off-by: Lucas Wilkinson <lcwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Co-authored-by: simon-mo <xmo@berkeley.edu>
kerthcet · Feb 21, 2025 · 9345530 · 9345530
1 parent 20d8424
commit 9345530
Show file tree

Hide file tree

Showing 10 changed files with 429 additions and 34 deletions.
diff --git a/csrc/cache.h b/csrc/cache.h
@@ -15,6 +15,9 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                  std::vector<torch::Tensor> const& value_caches,
                  const torch::Tensor& block_mapping);
 
+void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
+                     const torch::Tensor& block_mapping);
+
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,

diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
@@ -46,7 +46,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
   char* src_ptr = static_cast<char*>(src.data_ptr());
   char* dst_ptr = static_cast<char*>(dst.data_ptr());
 
-  const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
+  // We use the stride instead of numel in case the cache is padded for memory
+  // alignment reasons, we assume the blocks data (inclusive of any padding)
+  // is contiguous in memory
+  const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
   const at::cuda::OptionalCUDAGuard device_guard(
       src_device.is_cuda() ? src_device : dst_device);
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -93,6 +96,24 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
   }
 }
 
+// Kernel for MLA, which works on a single joint kv_cache
+// Grid: (num_layers, num_pairs)
+template <typename scalar_t>
+__global__ void copy_blocks_mla_kernel(
+    int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping,
+    const int mem_footprint_per_block) {
+  const int layer_idx = blockIdx.x;
+  const int pair_idx = blockIdx.y;
+  scalar_t* cache = reinterpret_cast<scalar_t*>(cache_ptrs[layer_idx]);
+  int64_t src_block = block_mapping[2 * pair_idx];
+  int64_t dst_block = block_mapping[2 * pair_idx + 1];
+  int64_t src_offset = src_block * mem_footprint_per_block;
+  int64_t dst_offset = dst_block * mem_footprint_per_block;
+  for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) {
+    cache[dst_offset + i] = cache[src_offset + i];
+  }
+}
+
 }  // namespace vllm
 
 // Note: the key_caches and value_caches vectors are constant but
@@ -147,6 +168,42 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
       }));
 }
 
+// copy blocks kernel for MLA (assumes a joint KV-cache)
+void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
+                     const torch::Tensor& block_mapping) {
+  int num_layers = kv_caches.size();
+  if (num_layers == 0) {
+    return;
+  }
+  torch::Device cache_device = kv_caches[0].device();
+  TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
+
+  std::vector<int64_t> cache_ptrs(num_layers);
+  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
+    cache_ptrs[layer_idx] =
+        reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
+  }
+  torch::Tensor cache_ptrs_tensor =
+      torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
+          .to(cache_device);
+
+  int num_pairs = block_mapping.size(0);
+  // We use the stride instead of numel in case the cache is padded for memory
+  // alignment reasons, we assume the blocks data (inclusive of any padding)
+  // is contiguous in memory
+  int mem_footprint_per_block = kv_caches[0].stride(0);
+  dim3 grid(num_layers, num_pairs);
+  dim3 block(std::min(1024, mem_footprint_per_block));
+  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
+      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
+        vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
+            cache_ptrs_tensor.data_ptr<int64_t>(),
+            block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
+      }));
+}
+
 namespace vllm {
 
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@@ -254,6 +311,7 @@ __global__ void concat_and_cache_mla_kernel(
                                      // + pe_dim)]
     const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int block_stride,                    //
+    const int entry_stride,                    //
     const int kv_c_stride,                     //
     const int k_pe_stride,                     //
     const int kv_lora_rank,                    //
@@ -274,9 +332,8 @@ __global__ void concat_and_cache_mla_kernel(
                   int src_stride, int dst_stride, int size, int offset) {
     for (int i = threadIdx.x; i < size; i += blockDim.x) {
       const int64_t src_idx = token_idx * src_stride + i;
-      const int64_t dst_idx = block_idx * block_stride +
-                              block_offset * (kv_lora_rank + pe_dim) + i +
-                              offset;
+      const int64_t dst_idx =
+          block_idx * block_stride + block_offset * entry_stride + i + offset;
       if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
         dst[dst_idx] = src[src_idx];
       } else {
@@ -391,14 +448,14 @@ void reshape_and_cache_flash(
 // KV_T is the stored data type of kv-cache.
 // CACHE_T is the data type of key and value tensors.
 // KV_DTYPE is the real data type of kv-cache.
-#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)             \
-  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>           \
-      <<<grid, block, 0, stream>>>(                                    \
-          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                    \
-          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                    \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),             \
-          slot_mapping.data_ptr<int64_t>(), block_stride, kv_c_stride, \
-          k_pe_stride, kv_lora_rank, pe_dim, block_size,               \
+#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)              \
+  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>            \
+      <<<grid, block, 0, stream>>>(                                     \
+          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
+          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
+          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
+          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
+          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
           reinterpret_cast<const float*>(scale.data_ptr()));
 
 void concat_and_cache_mla(
@@ -428,6 +485,7 @@ void concat_and_cache_mla(
   int kv_c_stride = kv_c.stride(0);
   int k_pe_stride = k_pe.stride(0);
   int block_stride = kv_cache.stride(0);
+  int entry_stride = kv_cache.stride(1);
 
   dim3 grid(num_tokens);
   dim3 block(std::min(kv_lora_rank, 512));

diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
@@ -450,6 +450,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "Tensor block_mapping) -> ()");
   cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
 
+  cache_ops.def(
+      "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
+  cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
+
   // Reshape the key and value tensors and cache them.
   cache_ops.def(
       "reshape_and_cache(Tensor key, Tensor value,"