vllm-project · jikunshang · Jul 15, 2024 · Jul 16, 2024
@@ -162,6 +162,11 @@ def apply(self,
         out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
         reshaped_x = x.reshape(-1, x.shape[-1])
 
+        if use_ipex():
+            # The exact API depends on ipex; it may also fuse the bias.
+            out = ipex_awq_gemm(x, qweight, scales, qzeros, pack_factor, ...)
+            return out
+
         # num_tokens >= threshold
         FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256
 

diff --git a/vllm/utils.py b/vllm/utils.py
@@ -224,6 +224,14 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
+@lru_cache(maxsize=None)
+def use_ipex() -> bool:
+    """Report whether ``intel_extension_for_pytorch`` can be imported.
+
+    The import is attempted on the first call only; ``lru_cache`` serves
+    the same answer to every later call.
+    """
+    try:
+        import intel_extension_for_pytorch  # noqa: F401
+    except ImportError:
+        return False
+    return True
+
 @lru_cache(maxsize=None)
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""

diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
@@ -74,6 +74,14 @@ def from_broadcasted_tensor_dict(
                 attn_backend, tensor_dict)
         return cls(**tensor_dict)
 
+# ipex (both CPU and XPU) optimizes the model weight layout to fully
+# leverage the hardware. We want to add a CPU quantization optimization:
+# the CPU performs best with a specific weight layout (which differs from
+# the CUDA device layout), so a repack API should be called.
+def IpexXPUModelQuantWeightWrapper(model=None):
+    """Placeholder hook for repacking a quantized model's weights for ipex.
+
+    The repack step is not implemented yet, so ``model`` is handed back
+    unchanged. The argument has to be accepted and returned because
+    ``load_model`` calls this function with ``self.model`` and assigns the
+    result back to ``self.model``; returning nothing would replace the
+    loaded model with ``None``.
+
+    Args:
+        model: The loaded model. Defaults to ``None`` so that a call with
+            no argument keeps returning ``None`` as before.
+
+    Returns:
+        The same ``model`` object that was passed in.
+    """
+    # TODO: for quant models, call the ipex repack API,
+    # e.g. ``return ipex.repack_awq(model)``.
+    return model
 
 class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
 
@@ -144,6 +152,8 @@ def load_model(self) -> None:
         logger.info("Loading model weights took %.4f GB",
                     self.model_memory_usage / float(2**30))
 
+        self.model = IpexXPUModelQuantWeightWrapper(self.model)
+
     @property
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()