From 082ecd80d58c6604f44c0196cb9db5bc4befd6d7 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 20 Jul 2024 19:25:56 -0400
Subject: [PATCH] [ Bugfix ] Fix AutoFP8 fp8 marlin (#6609)

---
 .../layers/quantization/utils/marlin_utils_fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index aabd46e64536f..c878939580f10 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -76,7 +76,8 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
     # WEIGHT SCALES
     # Currently Marlin doesn't support per-tensor scales, so we
     # expand it to channelwise
-    is_channelwise = layer.weight_scale.shape[0] == part_size_n
+    is_channelwise = (len(layer.weight_scale.shape) > 0
+                      and layer.weight_scale.shape[0] == part_size_n)
     if is_channelwise:
         scales = layer.weight_scale
     else:
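
Note (not part of the patch): a minimal standalone sketch of why the rank guard is needed, assuming an AutoFP8 checkpoint whose per-tensor weight_scale loads as a 0-dim torch tensor; the names and the shard size below are illustrative, not taken from vLLM. Indexing .shape[0] on a 0-dim tensor raises IndexError, which the guarded check avoids.

import torch

part_size_n = 4096  # hypothetical output-shard size, for illustration only

per_tensor_scale = torch.tensor(0.05)        # 0-dim tensor: shape == torch.Size([])
channelwise_scale = torch.rand(part_size_n)  # 1-dim tensor: shape == (part_size_n,)

# Pre-fix check: indexing shape[0] on a 0-dim tensor raises IndexError,
# since torch.Size is an empty tuple here.
try:
    _ = per_tensor_scale.shape[0] == part_size_n
except IndexError as exc:
    print(f"old check fails: {exc}")  # tuple index out of range

# Post-fix check: guard on tensor rank before indexing, as in the patch.
def is_channelwise(scale: torch.Tensor, n: int) -> bool:
    return len(scale.shape) > 0 and scale.shape[0] == n

print(is_channelwise(per_tensor_scale, part_size_n))   # False -> expand to channelwise
print(is_channelwise(channelwise_scale, part_size_n))  # True  -> use scales as-is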