Commit

change naming
SunMarc committed Dec 12, 2023
1 parent 4f770d6 commit 270b60b
Showing 2 changed files with 23 additions and 15 deletions.
26 changes: 15 additions & 11 deletions optimum/gptq/quantizer.py
@@ -77,7 +77,7 @@ def __init__(
exllama_config: Dict[str, Any] = None,
max_input_length: Optional[int] = None,
cache_block_outputs: Optional[bool] = True,
-        inside_layer_modules: Optional[List[List[str]]] = None,
+        modules_to_quantize_inside_block: Optional[List[List[str]]] = None,
*args,
**kwargs,
):
@@ -107,7 +107,7 @@ def __init__(
model_seqlen (`Optional[int]`, defaults to `None`):
The maximum sequence length that the model can take.
block_name_to_quantize (`Optional[str]`, defaults to `None`):
-                The transformers block name to quantize.
+                The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. `model.layers`).
module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
The layers that are preceding the first Transformer block.
batch_size (`int`, defaults to `1`):
@@ -124,8 +124,9 @@ def __init__(
cache_block_outputs (`bool`, defaults to `True`):
Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
(e.g. ChatGLM) but can require more time.
-            inside_layer_modules (`List[List[str]]`, *optional*, defaults to `None`):
-                List of module names to quantize inside block_name_to_quantize. If not set, we will quantize all the linear layers.
+            modules_to_quantize_inside_block (`List[List[str]]`, *optional*, defaults to `None`):
+                List of lists of module names to quantize in the specified block. The block to quantize can be specified by setting
+                `block_name_to_quantize`. We will quantize each list sequentially.
"""

self.bits = bits
@@ -146,7 +147,7 @@ def __init__(
self.max_input_length = max_input_length
self.quant_method = QuantizationMethod.GPTQ
self.cache_block_outputs = cache_block_outputs
-        self.inside_layer_modules = inside_layer_modules
+        self.modules_to_quantize_inside_block = modules_to_quantize_inside_block

self.serialization_keys = [
"bits",
@@ -157,7 +158,7 @@ def __init__(
"sym",
"true_sequential",
"quant_method",
"inside_layer_modules",
"modules_to_quantize_inside_block",
]

if self.bits not in [2, 3, 4, 8]:
@@ -215,8 +216,8 @@ def convert_model(self, model: nn.Module):
self.block_name_to_quantize = get_block_name_with_pattern(model)
block_name = self.block_name_to_quantize
layers_to_be_replaced = get_layers(model, prefix=block_name)
-        if self.inside_layer_modules is not None:
-            layers_to_keep = sum(self.inside_layer_modules, [])
+        if self.modules_to_quantize_inside_block is not None:
+            layers_to_keep = sum(self.modules_to_quantize_inside_block, [])
for name in list(layers_to_be_replaced.keys()):
if not any(name.endswith(layer) for layer in layers_to_keep):
logger.info(f"{name} has not been quantized. We don't convert it")
@@ -454,11 +455,14 @@ def store_input_hook(_, input, *args):
if not has_device_map or get_device(block) == torch.device("cpu"):
block = block.to(0)
layers = get_layers(block)
-            if isinstance(self.inside_layer_modules, list) and len(self.inside_layer_modules) > 0:
+            if (
+                isinstance(self.modules_to_quantize_inside_block, list)
+                and len(self.modules_to_quantize_inside_block) > 0
+            ):
if self.true_sequential:
-                    layers_name_list = self.inside_layer_modules
+                    layers_name_list = self.modules_to_quantize_inside_block
else:
-                    layers_name_list = [sum(self.inside_layer_modules, [])]
+                    layers_name_list = [sum(self.modules_to_quantize_inside_block, [])]
else:
if self.true_sequential:
# lazy sequential but works well
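
For reference, the renamed argument is consumed directly by GPTQQuantizer. The snippet below is a minimal usage sketch based on the signature in this diff; the checkpoint name, dataset choice, and module names are illustrative assumptions rather than values taken from this commit.

# Minimal usage sketch of the renamed argument (assumed example values).
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

model_id = "bigscience/bloom-560m"  # hypothetical checkpoint for illustration
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

quantizer = GPTQQuantizer(
    bits=4,
    dataset="c4",
    true_sequential=True,
    # Only these sub-modules of each Transformer block are quantized; any other
    # linear layer (e.g. self_attention.dense) is left unconverted.
    modules_to_quantize_inside_block=[
        ["self_attention.query_key_value"],
        ["mlp.dense_h_to_4h"],
        ["mlp.dense_4h_to_h"],
    ],
)
# A CUDA device is typically required for the calibration/quantization step.
quantized_model = quantizer.quantize_model(model, tokenizer)

With true_sequential=True, each inner list is quantized as its own sequential step inside every block, which corresponds to the layers_name_list branch in the hunk above.
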
12 changes: 8 additions & 4 deletions tests/gptq/test_quantization.py
@@ -53,7 +53,7 @@ class GPTQTest(unittest.TestCase):
disable_exllama = True
exllama_config = None
cache_block_outputs = True
-    inside_layer_modules = None
+    modules_to_quantize_inside_block = None

dataset = [
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
@@ -79,7 +79,7 @@ def setUpClass(cls):
disable_exllama=cls.disable_exllama,
exllama_config=cls.exllama_config,
cache_block_outputs=cls.cache_block_outputs,
-            inside_layer_modules=cls.inside_layer_modules,
+            modules_to_quantize_inside_block=cls.modules_to_quantize_inside_block,
)

cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer)
@@ -303,9 +303,13 @@ class GPTQTestNoBlockCaching(GPTQTest):
EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")


-class GPTQTestInsideLayerModules(GPTQTest):
+class GPTQTestModuleQuant(GPTQTest):
# all layers are quantized apart from self_attention.dense
-    inside_layer_modules = [["self_attention.query_key_value"], ["mlp.dense_h_to_4h"], ["mlp.dense_4h_to_h"]]
+    modules_to_quantize_inside_block = [
+        ["self_attention.query_key_value"],
+        ["mlp.dense_h_to_4h"],
+        ["mlp.dense_4h_to_h"],
+    ]
EXPECTED_RELATIVE_DIFFERENCE = 1.57705236164535

def test_not_converted_layers(self):
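
The test_not_converted_layers override above exercises the filtering in convert_model shown in the first file: the nested lists are flattened with sum(..., []) and a linear layer is only converted when its name ends with one of the configured entries. Below is a toy, self-contained sketch of that check; the candidate layer names are assumed examples, not output from the test.

modules_to_quantize_inside_block = [
    ["self_attention.query_key_value"],
    ["mlp.dense_h_to_4h"],
    ["mlp.dense_4h_to_h"],
]
# Flatten the nested lists, mirroring sum(self.modules_to_quantize_inside_block, []).
layers_to_keep = sum(modules_to_quantize_inside_block, [])

candidate_layers = [
    "transformer.h.0.self_attention.query_key_value",
    "transformer.h.0.self_attention.dense",
    "transformer.h.0.mlp.dense_h_to_4h",
    "transformer.h.0.mlp.dense_4h_to_h",
]
for name in candidate_layers:
    if any(name.endswith(layer) for layer in layers_to_keep):
        print(f"{name} -> quantized")
    else:
        print(f"{name} -> not converted, stays a regular linear layer")

Only self_attention.dense falls through to the second branch, which is the behaviour GPTQTestModuleQuant expects.
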
