[Kernel] Support running GPTQ 8-bit models in Marlin (vllm-project#4533)
alexm-redhat authored and dtrifiro committed May 7, 2024
1 parent 49e083c commit 81c0f04
Showing 7 changed files with 553 additions and 324 deletions.
4 changes: 3 additions & 1 deletion csrc/ops.h

@@ -132,6 +132,7 @@ torch::Tensor gptq_marlin_gemm(
     torch::Tensor &g_idx,
     torch::Tensor &perm,
     torch::Tensor &workspace,
+    int64_t num_bits,
     int64_t size_m,
     int64_t size_n,
     int64_t size_k,
@@ -141,7 +142,8 @@ torch::Tensor gptq_marlin_repack(
     torch::Tensor &b_q_weight,
     torch::Tensor &perm,
     int64_t size_k,
-    int64_t size_n);
+    int64_t size_n,
+    int64_t num_bits);
 #endif
 
 void squeezellm_gemm(
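The new `num_bits` argument lets `gptq_marlin_gemm` and `gptq_marlin_repack` distinguish 4-bit from 8-bit GPTQ weights. As an illustration only (not vLLM code), here is a minimal sketch of why the kernels need this: GPTQ packs quantized values into 32-bit words, so the pack factor — and therefore the packed weight layout both functions must agree on — follows directly from `num_bits`. The helper names below are hypothetical.

```python
def pack_factor(num_bits: int) -> int:
    """Quantized values packed per 32-bit word for a given bit width."""
    if 32 % num_bits != 0:
        raise ValueError("num_bits must divide 32")
    return 32 // num_bits

def packed_weight_shape(size_k: int, size_n: int, num_bits: int) -> tuple:
    """Shape of the packed int32 weight matrix for a (size_k, size_n) layer,
    assuming packing along the K (input) dimension."""
    return (size_k // pack_factor(num_bits), size_n)

# 4-bit weights pack 8 values per word; 8-bit weights pack only 4,
# so the packed K dimension doubles when moving from 4-bit to 8-bit.
print(pack_factor(4), pack_factor(8))       # 8 4
print(packed_weight_shape(4096, 11008, 8))  # (1024, 11008)
```

Passing `num_bits` explicitly, rather than baking the width into the kernel, lets one repack/GEMM entry point serve both quantization widths.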
