Commit 8837c74
[AMD][Quark] Fix fails in pr checks
kewang2 committed Dec 2, 2024
1 parent f0c2c8d commit 8837c74
Showing 19 changed files with 116 additions and 64 deletions.
@@ -332,9 +332,10 @@ def get_scheme(

return scheme

# move the get_compressed_tensors_cache_scale method from utils.py to instance
# method of CompressedTensorsConfig class. By doing this, different
# QuantizationConfig classes can implement their own get_cache_scale method.
# move the get_compressed_tensors_cache_scale method from
# utils.py to instance method of CompressedTensorsConfig
# class. By doing this, different QuantizationConfig
# classes can implement their own get_cache_scale method.
def get_cache_scale(self, name: str) -> Optional[List[str]]:
"""
Check whether the param name matches the format for k/v cache scales
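The comment rewrapped above explains the refactor itself: get_compressed_tensors_cache_scale moved from utils.py onto the config class as an instance method, so every QuantizationConfig subclass can supply its own mapping. A minimal sketch of that dispatch pattern follows; the class names and the scale-name mapping are illustrative stand-ins, not the actual vLLM implementations.

from typing import List, Optional


class BaseQuantizationConfig:
    """Stand-in for the QuantizationConfig base class."""

    def get_cache_scale(self, name: str) -> Optional[List[str]]:
        # Default: this quantization backend has no kv-cache scales to remap.
        return None


class ExampleQuantConfig(BaseQuantizationConfig):
    """Hypothetical backend storing k/v scales under '*.output_scale'."""

    def get_cache_scale(self, name: str) -> Optional[List[str]]:
        if name.endswith(".k_proj.output_scale"):
            return [name.replace(".k_proj.output_scale", ".attn.k_scale")]
        if name.endswith(".v_proj.output_scale"):
            return [name.replace(".v_proj.output_scale", ".attn.v_scale")]
        return None


# A model's load_weights loop can ask whichever config it was built with,
# without knowing which backend produced the checkpoint:
cfg = ExampleQuantConfig()
print(cfg.get_cache_scale("model.layers.0.self_attn.k_proj.output_scale"))
# -> ['model.layers.0.self_attn.attn.k_scale']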
30 changes: 20 additions & 10 deletions vllm/model_executor/layers/quantization/quark/quark.py
@@ -8,9 +8,11 @@
from quark.torch.quantization.config.type import QSchemeType, Dtype
from quark.torch.quantization.config.config import (Config,
QuantizationSpec)
from quark.torch.quantization.config.config import QuantizationConfig as QuarkQuantConfig
from quark.torch.quantization.config.config import (
QuantizationConfig as QuarkQuantConfig)

from vllm.model_executor.layers.quantization.utils.quant_utils import FUSED_LAYER_NAME_MAPPING
from vllm.model_executor.layers.quantization.utils.quant_utils import (
FUSED_LAYER_NAME_MAPPING)
from vllm.model_executor.layers.quantization.quark.utils import (deep_compare,
should_ignore_layer)

@@ -33,9 +35,11 @@ class QuarkConfig(QuantizationConfig):

def __init__(self,
quant_config: Config,
kv_cache_group: List[str] = [],
kv_cache_group: Optional[List[str]] = None,
kv_cache_config: Optional[QuantizationSpec] = None,
pack_method: str = "reorder"):
if kv_cache_group is None:
kv_cache_group = []
self.quant_config = quant_config
self.kv_cache_group = kv_cache_group
self.kv_cache_config = kv_cache_config
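The __init__ hunk above replaces the mutable default kv_cache_group: List[str] = [] with Optional[List[str]] = None plus an explicit None check. A small self-contained illustration of the pitfall that change avoids; the functions here are generic examples, not Quark code.

def append_bad(item, bucket=[]):
    # The [] default is created once at function definition time, so every
    # call that omits `bucket` mutates the same shared list.
    bucket.append(item)
    return bucket


def append_good(item, bucket=None):
    # None default plus an explicit check gives each call a fresh list,
    # which is the pattern the diff adopts for kv_cache_group.
    if bucket is None:
        bucket = []
    bucket.append(item)
    return bucket


print(append_bad("a"), append_bad("b"))    # ['a', 'b'] ['a', 'b']
print(append_good("a"), append_good("b"))  # ['a'] ['b']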
@@ -71,13 +75,17 @@ def get_quant_method(
if isinstance(layer, Attention):
return QuarkKVCacheMethod(self)
if isinstance(layer, FusedMoE):
return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix)
return QuarkMoEMethod.get_moe_method(self, module=layer,
layer_name=prefix)
return None

@classmethod
def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig":
quant_config = Config.from_dict(config)
export_config = config.get("export")
if export_config is None:
raise ValueError("The export key should be included in "
"the configurations of Quark quantized model")
kv_cache_group = cast(List[str], export_config.get("kv_cache_group"))
pack_method = cast(str, export_config.get("pack_method"))

@@ -89,11 +97,13 @@ def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig":
layer_quant_set = set(layer_quant_names)

if not kv_cache_set.issubset(layer_quant_set):
raise ValueError("The Quark quantized model has the kv_cache_group "
"parameter setting, but no kv_cache quantization settings "
"were found in the quantization configuration.")
raise ValueError("The Quark quantized model has the "
"kv_cache_group parameter setting, "
"but no kv_cache quantization settings "
"were found in the quantization "
"configuration.")

q_configs = [quant_config.layer_quant_config[name] for name in kv_cache_group]

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:106:81: E501 Line too long (91 > 80)
if not all(deep_compare(q_config, q_configs[0]) for q_config in q_configs):

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:107:81: E501 Line too long (87 > 80)
raise ValueError("The quantization method used for kv_cache should be the same, "

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:108:81: E501 Line too long (97 > 80)
"but the quantization method for the kv_cache layer in the "

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:109:81: E501 Line too long (93 > 80)
@@ -179,7 +189,7 @@ def _is_static_tensor_w8a8(self,
return is_int8_dtype and is_tensor and weight_quant.symmetric and is_static

Check failure (GitHub Actions / ruff (3.12)): vllm/model_executor/layers/quantization/quark/quark.py:189:81: E501 Line too long (83 > 80)

def _find_matched_config(self,
layer_name: Optional[str],
layer_name: str,
module: torch.nn.Module) -> "QuarkQuantConfig":

proj_name = layer_name.split(".")[-1]
@@ -232,7 +242,7 @@ def _get_scheme_from_config(self, config: QuarkQuantConfig) -> "QuarkScheme":
def get_scheme(
self,
layer: torch.nn.Module,
layer_name: Optional[str] = None) -> "QuarkScheme":
layer_name: str) -> "QuarkScheme":

layer_quant_config = self._find_matched_config(layer_name, layer)

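The from_config hunk above adds two guards: the checkpoint config must carry an export section, and every kv_cache_group entry must have a matching per-layer quantization config (which the following deep_compare check then requires to be identical). A standalone sketch of those first two checks; the function name and arguments are illustrative, not the real QuarkConfig API.

from typing import Any, Dict, List


def validate_quark_export(config: Dict[str, Any],
                          layer_quant_config: Dict[str, Any]) -> List[str]:
    export_config = config.get("export")
    if export_config is None:
        # Same failure mode as the diff: "export" must be present.
        raise ValueError("The export key should be included in "
                         "the configurations of Quark quantized model")
    kv_cache_group: List[str] = export_config.get("kv_cache_group", [])
    # Every kv_cache group name needs a per-layer quantization config entry.
    missing = set(kv_cache_group) - set(layer_quant_config)
    if missing:
        raise ValueError("kv_cache_group entries without quantization "
                         f"settings: {sorted(missing)}")
    return kv_cache_group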
17 changes: 9 additions & 8 deletions vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,6 +1,5 @@
import enum
from enum import Enum
from typing import Callable, List, Optional

from typing import Callable, Optional

import torch
from quark.torch.quantization.config.type import QSchemeType
@@ -22,15 +21,17 @@ class QuarkMoEMethod(FusedMoEMethodBase):

@staticmethod
def get_moe_method(
quant_config: "QuarkConfig",
quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821
module: torch.nn.Module,
layer_name: str
) -> "QuarkMoEMethod":
layer_quant_config = quant_config._find_matched_config(layer_name, module)
layer_quant_config = quant_config._find_matched_config(layer_name,
module)

if layer_quant_config.output_tensors or layer_quant_config.bias:
raise NotImplementedError("Currently, Quark models with output_tensors "
"and bias quantized are not supported")
raise NotImplementedError("Currently, Quark models with "
"output_tensors and bias "
"quantized are not supported")
weight_config = layer_quant_config.weight
input_config = layer_quant_config.input_tensors

@@ -54,7 +55,7 @@ def __init__(
raise ValueError(
"For FP8 Fused MoE layers, only per-tensor scales"
"for weights and activations are supported. Found "
f"{self.weight_quant.qscheme.value}, {self.input_quant.qscheme.value}")
f"{self.weight_quant.qscheme.value}, {self.input_quant.qscheme.value}") # noqa E501

self.static_input_scales = not self.input_quant.is_dynamic

(file name not captured in this view)
@@ -72,7 +72,7 @@ def process_weights_after_loading(self, layer) -> None:
layer.weight_scale = Parameter(weight_scale, requires_grad=False)

else:
raise ValueError(f"Unknown quantization strategy {self.strategy}")
raise ValueError(f"Unknown quantization scheme {self.qscheme}")

# INPUT SCALE
if self.is_static_input_scheme:
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/quark/utils.py
@@ -1,11 +1,11 @@

import re
from typing import Dict, Any, Optional, Iterable
from typing import Any, Optional, Iterable
from vllm.model_executor.layers.quantization.utils.quant_utils import (
FUSED_LAYER_NAME_MAPPING)

def deep_compare(dict1: Any, dict2: Any) -> bool:
if type(dict1) != type(dict2):
if type(dict1) is not type(dict2):
return False
if isinstance(dict1, dict):
if dict1.keys() != dict2.keys():
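The utils.py hunk above switches the comparison to type(dict1) is not type(dict2), the identity check that ruff's E721 rule expects, and the diff only shows the first lines of deep_compare. A standalone sketch of the full recursive comparison in the same spirit; the real helper may differ in detail.

from typing import Any


def deep_compare(a: Any, b: Any) -> bool:
    # Identity check on the types (E721-friendly), then structural recursion.
    if type(a) is not type(b):
        return False
    if isinstance(a, dict):
        return a.keys() == b.keys() and all(
            deep_compare(a[k], b[k]) for k in a)
    if isinstance(a, (list, tuple)):
        return len(a) == len(b) and all(
            deep_compare(x, y) for x, y in zip(a, b))
    return a == b


print(deep_compare({"w": [8, "int"]}, {"w": [8, "int"]}))  # True
print(deep_compare({"w": [8, "int"]}, {"w": (8, "int")}))  # False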
9 changes: 6 additions & 3 deletions vllm/model_executor/models/commandr.py
@@ -420,13 +420,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
loaded_params: Set[str] = set()
for name, loaded_weight in weights:

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
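The commandr.py hunk above, and the same hunk repeated in the model files that follow (dbrx, exaone, gpt_j, granite, llama, mixtral, mllama, nemotron, phimoe), adds two things to each load_weights loop: a None guard on self.quant_config before calling get_cache_scale, and handling for scales stored either as 0-d tensors or as 1-element tensors. A condensed standalone sketch of that shared branch; the helper name and the default_loader argument are illustrative rather than vLLM API.

def maybe_load_kv_cache_scale(name, loaded_weight, params_dict,
                              quant_config, loaded_params, default_loader):
    """Return True if `name` was consumed as a kv-cache scale parameter."""
    if quant_config is None:
        # Unquantized models have no cache scales to remap.
        return False
    scale_names = quant_config.get_cache_scale(name)
    if not scale_names:
        return False
    for scale_name in scale_names:
        param = params_dict[scale_name]
        weight_loader = getattr(param, "weight_loader", default_loader)
        # Checkpoints may store the scale as a 0-d tensor or as a
        # 1-element tensor; normalize to a scalar-shaped value.
        value = (loaded_weight if loaded_weight.dim() == 0
                 else loaded_weight[0])
        weight_loader(param, value)
        loaded_params.add(scale_name)
    return True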
9 changes: 6 additions & 3 deletions vllm/model_executor/models/dbrx.py
@@ -450,13 +450,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
loaded_params: Set[str] = set()

for name, loaded_weight in weights:
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/exaone.py
@@ -538,13 +538,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# processed with quantization, LoRA, fine-tuning, etc.
if self.config.tie_word_embeddings and "lm_head.weight" in name:
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/gpt_j.py
@@ -314,13 +314,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "attn.bias" in name or "attn.masked_bias" in name:
continue

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/granite.py
@@ -480,13 +480,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# processed with quantization, LoRA, fine-tuning, etc.
if self.config.tie_word_embeddings and "lm_head.weight" in name:
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/llama.py
@@ -387,13 +387,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/mixtral.py
@@ -429,13 +429,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "rotary_emb.inv_freq" in name:
continue

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/mllama.py
@@ -1431,13 +1431,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
name = name.replace('patch_embedding.weight',
'patch_embedding._linear.weight')
loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
updated_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/nemotron.py
@@ -494,13 +494,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
9 changes: 6 additions & 3 deletions vllm/model_executor/models/phimoe.py
@@ -624,13 +624,16 @@ def load_weights(self, weights: Iterable[Tuple[str,
if "rotary_emb.inv_freq" in name:
continue

if scale_names := self.quant_config.get_cache_scale(name):
# Loading kv cache scales for compressed-tensors quantization
if (self.quant_config is not None and
(scale_names := self.quant_config.get_cache_scale(name))):
# Loading kv cache scales for quark and
# compressed-tensors quantization
for scale_name in scale_names:
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
loaded_weight = loaded_weight if loaded_weight.dim()==0 else loaded_weight[0]
loaded_weight = (loaded_weight if loaded_weight.dim()==0
else loaded_weight[0])
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
(remaining changed files not loaded in this view)