From 082ecd80d58c6604f44c0196cb9db5bc4befd6d7 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 20 Jul 2024 19:25:56 -0400
Subject: [PATCH] [ Bugfix ] Fix AutoFP8 fp8 marlin (#6609)

---
 .../layers/quantization/utils/marlin_utils_fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index aabd46e64536f..c878939580f10 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -76,7 +76,8 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
     # WEIGHT SCALES
     # Currently Marlin doesn't support per-tensor scales, so we
     # expand it to channelwise
-    is_channelwise = layer.weight_scale.shape[0] == part_size_n
+    is_channelwise = (len(layer.weight_scale.shape) > 0
+                      and layer.weight_scale.shape[0] == part_size_n)
     if is_channelwise:
         scales = layer.weight_scale
     else:
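
Note (not part of the patch): a minimal standalone sketch of why the rank guard is needed, assuming an AutoFP8 checkpoint whose per-tensor weight_scale loads as a 0-dim torch tensor; the names and the shard size below are illustrative, not taken from vLLM. Indexing .shape[0] on a 0-dim tensor raises IndexError, which the guarded check avoids.

import torch

part_size_n = 4096  # hypothetical output-shard size, for illustration only

per_tensor_scale = torch.tensor(0.05)        # 0-dim tensor: shape == torch.Size([])
channelwise_scale = torch.rand(part_size_n)  # 1-dim tensor: shape == (part_size_n,)

# Pre-fix check: indexing shape[0] on a 0-dim tensor raises IndexError,
# since torch.Size is an empty tuple here.
try:
    _ = per_tensor_scale.shape[0] == part_size_n
except IndexError as exc:
    print(f"old check fails: {exc}")  # tuple index out of range

# Post-fix check: guard on tensor rank before indexing, as in the patch.
def is_channelwise(scale: torch.Tensor, n: int) -> bool:
    return len(scale.shape) > 0 and scale.shape[0] == n

print(is_channelwise(per_tensor_scale, part_size_n))   # False -> expand to channelwise
print(is_channelwise(channelwise_scale, part_size_n))  # True  -> use scales as-is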