[WIP] Quark Quantizer Support #1207

Open

wants to merge 18 commits into main
Conversation

shobrienDMA
Contributor

This allows Quark Quantized models to be processed by ONNX Runtime GenAI.

Quark models must be exported in hf_format.

An example quantize_quark.py command:

python quantize_quark.py --model_dir /[Model_Path] \
    --output_dir /[Output_Model_Path] \
    --quant_scheme w_uint4_per_group_asym \
    --num_calib_data 128 \
    --quant_algo awq \
    --dataset pileval_for_awq_benchmark \
    --seq_len 512 \
    --model_export hf_format \
    --data_type float32

It also allows different group sizes for different layers, depending on what is present in the config.json that Quark produces. A Quark config can look like:

...
  "quantization_config": {
    "algo_config": {
      "model_decoder_layers": "model.layers",
      "name": "awq",
      "num_attention_heads": -1,
      "num_key_value_heads": -1,
      "scaling_layers": [
        {
          "inp": "self_attn.q_proj",
          "layers": [
            "self_attn.q_proj",
            "self_attn.k_proj",
            "self_attn.v_proj"
          ],
          "module2inspect": "self_attn",
          "prev_op": "input_layernorm"
        },
        {
          "inp": "self_attn.o_proj",
          "layers": [
            "self_attn.o_proj"
          ],
          "prev_op": "self_attn.v_proj"
        },
        {
          "inp": "mlp.gate_proj",
          "layers": [
            "mlp.gate_proj",
            "mlp.up_proj"
          ],
          "module2inspect": "mlp",
          "prev_op": "post_attention_layernorm"
        },
        {
          "inp": "mlp.down_proj",
          "layers": [
            "mlp.down_proj"
          ],
          "prev_op": "mlp.up_proj"
        }
      ]
    },
    "exclude": [],
    "export": {
      "kv_cache_group": [],
      "pack_method": "reorder",
      "weight_format": "real_quantized",
      "weight_merge_groups": null
    },
    "global_quant_config": {
      "bias": null,
      "input_tensors": null,
      "output_tensors": null,
      "target_device": null,
      "weight": {
        "ch_axis": 1,
        "dtype": "uint4",
        "group_size": 128,
        "is_dynamic": false,
        "observer_cls": "PerGroupMinMaxObserver",
        "qscheme": "per_group",
        "round_method": "half_even",
        "scale_type": "float",
        "symmetric": false
      }
    },
    "layer_quant_config": {
      "lm_head": {
        "bias": null,
        "input_tensors": null,
        "output_tensors": null,
        "target_device": null,
        "weight": {
          "ch_axis": 1,
          "dtype": "uint4",
          "group_size": 32,
          "is_dynamic": false,
          "observer_cls": "PerGroupMinMaxObserver",
          "qscheme": "per_group",
          "round_method": "half_even",
          "scale_type": "float",
          "symmetric": false
        }
      }
    },
    "layer_type_quant_config": {},
    "quant_method": "quark",
    "quant_mode": "eager_mode"
  },
...

As you can see, the lm_head entry in layer_quant_config uses a different group size (32, rather than the global 128).
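To illustrate how a converter might consume this config, here is a minimal sketch (not the PR's code; the helper name resolve_weight_config is made up for illustration) that returns the effective per-layer weight settings, falling back to global_quant_config when no layer_quant_config entry matches:

import json

def resolve_weight_config(quant_config, layer_name):
    # A per-layer override wins; otherwise fall back to the global settings.
    overrides = quant_config.get("layer_quant_config", {})
    if layer_name in overrides:
        return overrides[layer_name]["weight"]
    return quant_config["global_quant_config"]["weight"]

with open("config.json") as f:
    quant_config = json.load(f)["quantization_config"]

print(resolve_weight_config(quant_config, "lm_head")["group_size"])                     # 32
print(resolve_weight_config(quant_config, "model.layers.0.self_attn.q_proj")["group_size"])  # 128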

@BowenBao
Contributor

cc @kunal-vaishnavi this is the PR for Quark integration, please take a look and let us know what you think, thanks!

self.local_bits = self.global_bits
self.local_group_size = self.global_group_size

# Per-layer quantization support
Contributor

Can we make this section before the if-elif-else checks cleaner by having a method that sets all properties for Quark vs non-Quark scenarios (similar to self.get_config)? I'm thinking something like this could work.

for name, tensor in weights.items():
    module = self.set_module(...)

    if tensor.dtype == torch.bfloat16:
        ...

def set_module(self, ...):
    # Set any shared attributes and variables

    if quant_type == "quark":
        # Set Quark-related attributes and variables
    else:
        # Set non-Quark-related attributes and variables

Contributor

Updated with get_local_bits and get_local_group_size methods that are overridden by QuarkModel.
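For context, a rough sketch of what those overrides might look like (the class shapes and attribute names below are assumptions based on this thread, not the actual PR code):

import re

class Model:
    def get_local_bits(self, layer_name):
        # Non-Quark models use a single global setting for every layer.
        return self.global_bits

    def get_local_group_size(self, layer_name):
        return self.global_group_size

class QuarkModel(Model):
    def get_local_bits(self, layer_name):
        weight_cfg = self.layer_quant_config.get(layer_name, {}).get("weight")
        if weight_cfg is not None:
            return int(re.search(r"\d+", weight_cfg["dtype"]).group())  # "uint4" -> 4
        return self.global_bits

    def get_local_group_size(self, layer_name):
        weight_cfg = self.layer_quant_config.get(layer_name, {}).get("weight")
        if weight_cfg is not None:
            return weight_cfg["group_size"]
        return self.global_group_size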

if isinstance(self.lm_head, TensorModule):
    weight = self.lm_head.weight
    bias = self.lm_head.bias
    self.lm_head = QuantizedTensorModule(self.local_bits, self.local_group_size)
Contributor

Can we find a cleaner way to initialize self.lm_head to be a TensorModule versus a QuantizedTensorModule beforehand rather than using the self._initialize_quantized_lm_head approach or this approach?

Contributor

This part of the code has been removed and merged with the existing blocks that update self.lm_head.
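Roughly, the merged approach could look like the following inside the existing weight loop (illustrative only; the regex and names are assumptions, not the actual diff):

if re.match(r"^lm_head\.(qweight|weight)$", name):
    if not isinstance(self.lm_head, QuantizedTensorModule):
        # Swap the plain TensorModule for a quantized one lazily,
        # using the per-layer settings resolved for lm_head.
        self.lm_head = QuantizedTensorModule(
            self.get_local_bits("lm_head"), self.get_local_group_size("lm_head")
        )
    self.lm_head.qweight = tensor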

@@ -151,13 +188,13 @@ def __init__(self, quant_type, input_path, bits, group_size, q_size, kv_size, in
# model.layers.layer_id.self_attn.rotary_emb.inv_freq
# Skip rotary embedding weights since they can be re-calculated when looping through the model
continue
elif bool(re.match(r"^model.layers\.\d+\.self_attn.q_proj\.qweight$", name)):
elif bool(re.match(r"^model.layers\.\d+\.self_attn.q_proj\.q?weight$", name)):
Contributor

This regex appears to work for both .qweight and .weight as the suffix. Does Quark produce quantized weight tensors with the .weight suffix instead of the .qweight suffix?

Contributor

Yes
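For reference, a quick check (not part of the PR) confirming the relaxed pattern accepts both suffixes:

import re

pattern = r"^model.layers\.\d+\.self_attn.q_proj\.q?weight$"
print(bool(re.match(pattern, "model.layers.0.self_attn.q_proj.qweight")))  # True
print(bool(re.match(pattern, "model.layers.0.self_attn.q_proj.weight")))   # True
print(bool(re.match(pattern, "model.layers.0.self_attn.q_proj.scales")))   # False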

# model.layers.layer_id.mlp.up_proj.scales
module.mlp.up_proj.scales = tensor
elif bool(re.match(r"^model.layers\.\d+\.mlp.up_proj\.qzeros$", name)):
elif bool(re.match(r"^model.layers\.\d+\.mlp.up_proj\.(qzeros|weight_zero_point)$", name)):
# model.layers.layer_id.mlp.up_proj.qzeros
Contributor

For each new regex pattern added in all of the condition checks, can you add a comment listing the exact pattern that is matched (like model.layers.layer_id.mlp.up_proj.qzeros)? It's easier to read a comment to know what is being matched than to parse the regex and work it out.

Contributor

done

# Set `g_idx` to None since it's not used in `MatMulNBits`
q_tensors.g_idx = None

# Unpack and repack all `Quantized TensorModule` classes in MLP
Contributor

Suggested change
# Unpack and repack all `Quantized TensorModule` classes in MLP
# Unpack and repack all `QuantizedTensorModule` classes in MLP

Contributor

done

@@ -77,29 +76,38 @@ def __init__(self, layer_id, bits, group_size):
self.input_layernorm = TensorModule()
self.self_attn = QuantizedAttention(bits, group_size)
self.post_attention_layernorm = TensorModule()
self.pre_feedforward_layernorm = TensorModule()
Contributor

This is for Gemma2 model support.

layer_id = curr_layer_id
module = self.layers.setdefault(layer_id, QuantizedDecoderLayer(layer_id, bits, group_size))
layer_id = int(name.split(".")[2])
module = self.layers.setdefault(layer_id, QuantizedDecoderLayer(layer_id, local_bits, local_group_size))
Contributor

Moved the module initialization here, since module is only used once layer_id is available; it is unused in the earlier if-elif branches that handle other layers.
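For reference, a quick illustration (plain Python, not PR code) of how the layer index is recovered from a tensor name of this form:

name = "model.layers.12.mlp.up_proj.weight_zero_point"
layer_id = int(name.split(".")[2])
print(layer_id)  # 12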
