Commit 8837c74
[AMD][Quark] Fix fails in pr checks
kewang2 committed Dec 2, 2024
1 parent f0c2c8d commit 8837c74
Showing 19 changed files with 116 additions and 64 deletions.
@@ -332,9 +332,10 @@ def get_scheme(

return scheme

# move the get_compressed_tensors_cache_scale method from utils.py to instance
# method of CompressedTensorsConfig class. By doing this, different
# QuantizationConfig classes can implement their own get_cache_scale method.
# move the get_compressed_tensors_cache_scale method from
# utils.py to instance method of CompressedTensorsConfig
# class. By doing this, different QuantizationConfig
# classes can implement their own get_cache_scale method.
def get_cache_scale(self, name: str) -> Optional[List[str]]:
"""
Check whether the param name matches the format for k/v cache scales
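The comment rewrapped above explains the refactor itself: get_compressed_tensors_cache_scale moved from utils.py onto the config class as an instance method, so every QuantizationConfig subclass can supply its own mapping. A minimal sketch of that dispatch pattern follows; the class names and the scale-name mapping are illustrative stand-ins, not the actual vLLM implementations.

from typing import List, Optional


class BaseQuantizationConfig:
    """Stand-in for the QuantizationConfig base class."""

    def get_cache_scale(self, name: str) -> Optional[List[str]]:
        # Default: this quantization backend has no kv-cache scales to remap.
        return None


class ExampleQuantConfig(BaseQuantizationConfig):
    """Hypothetical backend storing k/v scales under '*.output_scale'."""

    def get_cache_scale(self, name: str) -> Optional[List[str]]:
        if name.endswith(".k_proj.output_scale"):
            return [name.replace(".k_proj.output_scale", ".attn.k_scale")]
        if name.endswith(".v_proj.output_scale"):
            return [name.replace(".v_proj.output_scale", ".attn.v_scale")]
        return None


# A model's load_weights loop can ask whichever config it was built with,
# without knowing which backend produced the checkpoint:
cfg = ExampleQuantConfig()
print(cfg.get_cache_scale("model.layers.0.self_attn.k_proj.output_scale"))
# -> ['model.layers.0.self_attn.attn.k_scale']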
30 changes: 20 additions & 10 deletions vllm/model_executor/layers/quantization/quark/quark.py
@@ -8,9 +8,11 @@
from quark.torch.quantization.config.type import QSchemeType, Dtype
from quark.torch.quantization.config.config import (Config,
QuantizationSpec)
from quark.torch.quantization.config.config import QuantizationConfig as QuarkQuantConfig
from quark.torch.quantization.config.config import (
QuantizationConfig as QuarkQuantConfig)

from vllm.model_executor.layers.quantization.utils.quant_utils import FUSED_LAYER_NAME_MAPPING
from vllm.model_executor.layers.quantization.utils.quant_utils import (
FUSED_LAYER_NAME_MAPPING)
from vllm.model_executor.layers.quantization.quark.utils import (deep_compare,
should_ignore_layer)

@@ -33,9 +35,11 @@ class QuarkConfig(QuantizationConfig):

def __init__(self,
quant_config: Config,
kv_cache_group: List[str] = [],
kv_cache_group: Optional[List[str]] = None,
kv_cache_config: Optional[QuantizationSpec] = None,
pack_method: str = "reorder"):
if kv_cache_group is None:
kv_cache_group = []
self.quant_config = quant_config
self.kv_cache_group = kv_cache_group
self.kv_cache_config = kv_cache_config
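The __init__ hunk above replaces the mutable default kv_cache_group: List[str] = [] with Optional[List[str]] = None plus an explicit None check. A small self-contained illustration of the pitfall that change avoids; the functions here are generic examples, not Quark code.

def append_bad(item, bucket=[]):
    # The [] default is created once at function definition time, so every
    # call that omits `bucket` mutates the same shared list.
    bucket.append(item)
    return bucket


def append_good(item, bucket=None):
    # None default plus an explicit check gives each call a fresh list,
    # which is the pattern the diff adopts for kv_cache_group.
    if bucket is None:
        bucket = []
    bucket.append(item)
    return bucket


print(append_bad("a"), append_bad("b"))    # ['a', 'b'] ['a', 'b']
print(append_good("a"), append_good("b"))  # ['a'] ['b']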
@@ -71,13 +75,17 @@ def get_quant_method(
if isinstance(layer, Attention):
return QuarkKVCacheMethod(self)
if isinstance(layer, FusedMoE):
return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix)
return QuarkMoEMethod.get_moe_method(self, module=layer,
layer_name=prefix)
return None

@classmethod
def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig":
quant_config = Config.from_dict(config)
export_config = config.get("export")
if export_config is None:
raise ValueError("The export key should be included in "
"the configurations of Quark quantized model")
kv_cache_group = cast(List[str], export_config.get("kv_cache_group"))
pack_method = cast(str, export_config.get("pack_method"))

@@ -89,11 +97,13 @@ def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig":
layer_quant_set = set(layer_quant_names)

if not kv_cache_set.issubset(layer_quant_set):
raise ValueError("The Quark quantized model has the kv_cache_group "
"parameter setting, but no kv_cache quantization settings "
"were found in the quantization configuration.")
raise ValueError("The Quark quantized model has the "
"kv_cache_group parameter setting, "
"but no kv_cache quantization settings "
"were found in the quantization "
"configuration.")

q_configs = [quant_config.layer_quant_config[name] for name in kv_cache_group]

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:106:81: E501 Line too long (91 > 80)
if not all(deep_compare(q_config, q_configs[0]) for q_config in q_configs):

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:107:81: E501 Line too long (87 > 80)
raise ValueError("The quantization method used for kv_cache should be the same, "

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:108:81: E501 Line too long (97 > 80)
"but the quantization method for the kv_cache layer in the "

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:109:81: E501 Line too long (93 > 80)
@@ -179,7 +189,7 @@ def _is_static_tensor_w8a8(self,
return is_int8_dtype and is_tensor and weight_quant.symmetric and is_static

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:189:81: E501 Line too long (83 > 80)

def _find_matched_config(self,
layer_name: Optional[str],
layer_name: str,
module: torch.nn.Module) -> "QuarkQuantConfig":

proj_name = layer_name.split(".")[-1]
@@ -232,7 +242,7 @@ def _get_scheme_from_config(self, config: QuarkQuantConfig) -> "QuarkScheme":
def get_scheme(
self,
layer: torch.nn.Module,
layer_name: Optional[str] = None) -> "QuarkScheme":
layer_name: str) -> "QuarkScheme":

layer_quant_config = self._find_matched_config(layer_name, layer)

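The from_config hunk above adds two guards: the checkpoint config must carry an export section, and every kv_cache_group entry must have a matching per-layer quantization config (which the following deep_compare check then requires to be identical). A standalone sketch of those first two checks; the function name and arguments are illustrative, not the real QuarkConfig API.

from typing import Any, Dict, List


def validate_quark_export(config: Dict[str, Any],
                          layer_quant_config: Dict[str, Any]) -> List[str]:
    export_config = config.get("export")
    if export_config is None:
        # Same failure mode as the diff: "export" must be present.
        raise ValueError("The export key should be included in "
                         "the configurations of Quark quantized model")
    kv_cache_group: List[str] = export_config.get("kv_cache_group", [])
    # Every kv_cache group name needs a per-layer quantization config entry.
    missing = set(kv_cache_group) - set(layer_quant_config)
    if missing:
        raise ValueError("kv_cache_group entries without quantization "
                         f"settings: {sorted(missing)}")
    return kv_cache_group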
17 changes: 9 additions & 8 deletions vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,6 +1,5 @@
import enum
from enum import Enum
from typing import Callable, List, Optional

from typing import Callable, Optional

import torch
from quark.torch.quantization.config.type import QSchemeType
@@ -22,15 +21,17 @@ class QuarkMoEMethod(FusedMoEMethodBase):

@staticmethod
def get_moe_method(
quant_config: "QuarkConfig",
quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821
module: torch.nn.Module,
layer_name: str
) -> "QuarkMoEMethod":
layer_quant_config = quant_config._find_matched_config(layer_name, module)
layer_quant_config = quant_config._find_matched_config(layer_name,
module)

if layer_quant_config.output_tensors or layer_quant_config.bias:
raise NotImplementedError("Currently, Quark models with output_tensors "
"and bias quantized are not supported")
raise NotImplementedError("Currently, Quark models with "
"output_tensors and bias "
"quantized are not supported")
weight_config = layer_quant_config.weight
input_config = layer_quant_config.input_tensors

@@ -54,7 +55,7 @@ def __init__(
raise ValueError(
"For FP8 Fused MoE layers, only per-tensor scales"
"for weights and activations are supported. Found "
f"{self.weight_quant.qscheme.value}, {self.input_quant.qscheme.value}")
f"{self.weight_quant.qscheme.value}, {self.input_quant.qscheme.value}") # noqa E501

self.static_input_scales = not self.input_quant.is_dynamic

(file name not captured in this view)
@@ -72,7 +72,7 @@ def process_weights_after_loading(self, layer) -> None:
layer.weight_scale = Parameter(weight_scale, requires_grad=False)

else:
raise ValueError(f"Unknown quantization strategy {self.strategy}")
raise ValueError(f"Unknown quantization scheme {self.qscheme}")

# INPUT SCALE
if self.is_static_input_scheme:
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/quark/utils.py
@@ -1,11 +1,11 @@

import re
from typing import Dict, Any, Optional, Iterable
from typing import Any, Optional, Iterable
from vllm.model_executor.layers.quantization.utils.quant_utils import (
FUSED_LAYER_NAME_MAPPING)

def deep_compare(dict1: Any, dict2: Any) -> bool:
if type(dict1) != type(dict2):
if type(dict1) is not type(dict2):
return False
if isinstance(dict1, dict):
if dict1.keys() != dict2.keys():
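The utils.py hunk above switches the comparison to type(dict1) is not type(dict2), the identity check that ruff's E721 rule expects, and the diff only shows the first lines of deep_compare. A standalone sketch of the full recursive comparison in the same spirit; the real helper may differ in detail.

from typing import Any


def deep_compare(a: Any, b: Any) -> bool:
    # Identity check on the types (E721-friendly), then structural recursion.
    if type(a) is not type(b):
        return False
    if isinstance(a, dict):
        return a.keys() == b.keys() and all(
            deep_compare(a[k], b[k]) for k in a)
    if isinstance(a, (list, tuple)):
        return len(a) == len(b) and all(
            deep_compare(x, y) for x, y in zip(a, b))
    return a == b


print(deep_compare({"w": [8, "int"]}, {"w": [8, "int"]}))  # True
print(deep_compare({"w": [8, "int"]}, {"w": (8, "int")}))  # False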
9 changes: 6 additions & 3 deletions vllm/model_executor/models/commandr.py
@@ -420,13 +420,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
loaded_params: Set[str] = set()
for name, loaded_weight in weights:

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
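The commandr.py hunk above, and the same hunk repeated in the model files that follow (dbrx, exaone, gpt_j, granite, llama, mixtral, mllama, nemotron, phimoe), adds two things to each load_weights loop: a None guard on self.quant_config before calling get_cache_scale, and handling for scales stored either as 0-d tensors or as 1-element tensors. A condensed standalone sketch of that shared branch; the helper name and the default_loader argument are illustrative rather than vLLM API.

def maybe_load_kv_cache_scale(name, loaded_weight, params_dict,
                              quant_config, loaded_params, default_loader):
    """Return True if `name` was consumed as a kv-cache scale parameter."""
    if quant_config is None:
        # Unquantized models have no cache scales to remap.
        return False
    scale_names = quant_config.get_cache_scale(name)
    if not scale_names:
        return False
    for scale_name in scale_names:
        param = params_dict[scale_name]
        weight_loader = getattr(param, "weight_loader", default_loader)
        # Checkpoints may store the scale as a 0-d tensor or as a
        # 1-element tensor; normalize to a scalar-shaped value.
        value = (loaded_weight if loaded_weight.dim() == 0
                 else loaded_weight[0])
        weight_loader(param, value)
        loaded_params.add(scale_name)
    return True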
9 changes: 6 additions & 3 deletions vllm/model_executor/models/dbrx.py
@@ -450,13 +450,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
loaded_params: Set[str] = set()

for name, loaded_weight in weights:
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/exaone.py
@@ -538,13 +538,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# processed with quantization, LoRA, fine-tuning, etc.
if self.config.tie_word_embeddings and "lm_head.weight" in name:
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/gpt_j.py
@@ -314,13 +314,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "attn.bias" in name or "attn.masked_bias" in name:
continue

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/granite.py
@@ -480,13 +480,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# processed with quantization, LoRA, fine-tuning, etc.
if self.config.tie_word_embeddings and "lm_head.weight" in name:
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/llama.py
@@ -387,13 +387,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/mixtral.py
@@ -429,13 +429,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "rotary_emb.inv_freq" in name:
continue

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/mllama.py
@@ -1431,13 +1431,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
name = name.replace('patch_embedding.weight',
'patch_embedding._linear.weight')
loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
updated_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/nemotron.py
@@ -494,13 +494,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/phimoe.py
@@ -624,13 +624,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "rotary_emb.inv_freq" in name:
continue

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
(remaining changed files not loaded in this view)