From cf453360dd3e09394657172f8e8d8da23cfbf043 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 16 Jan 2025 17:11:45 -0800 Subject: [PATCH] Relax dtype requirements for int4 and float8 quants in autoquant (#1571) * Relax dtype requirements for int4 quants in autoquant Summary: Some of the int4 quant only works with bfloat16/float16, previously we require the model to be in correct dtype to apply these in autoquant, this PR relaxes the constraints by converting weight and activation to compatible dtypes Test Plan: python test/integration/test_integration.py -k test_autoquant_int4wo Reviewers: Subscribers: Tasks: Tags: * remove prints * add float8 * run pre-commit * run pre-commit * manual format * enable bias=True test * remove print --- test/integration/test_integration.py | 125 ++++++++++++---- torchao/dtypes/uintx/marlin_sparse_layout.py | 5 + torchao/quantization/autoquant.py | 146 +++++++++++++------ 3 files changed, 207 insertions(+), 69 deletions(-) diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py index bcd8af7ad3..1087db8cf8 100644 --- a/test/integration/test_integration.py +++ b/test/integration/test_integration.py @@ -25,6 +25,9 @@ AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight, AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight, AQFloat8WeightOnlyQuantizedLinearWeight, + AQGemliteInt4G64WeightOnlyQuantizedLinearWeight, + AQInt4G32WeightOnlyQuantizedLinearWeight, + AQInt4G128WeightOnlyQuantizedMarlinSparseLinearWeight, AQInt8DynamicallyQuantizedLinearWeight, AQInt8WeightOnlyQuantizedLinearWeight, AQInt8WeightOnlyQuantizedLinearWeight2, @@ -1751,37 +1754,109 @@ def test_autoquant_min_sqnr(self, device, dtype): @unittest.skipIf( not TORCH_VERSION_AT_LEAST_2_4, "autoquant float option requires 2.4+." ) - def test_autoquant_float(self): + def test_autoquant_hp_float(self): device = "cuda" dtype = torch.float32 m, k, n = 128, 128, 128 example_input = torch.randn(m, k, device=device, dtype=dtype) - model = ( - torch.nn.Sequential( - torch.nn.ReLU(), - torch.nn.Linear(k, n), - torch.nn.ReLU(), + for qclass in torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST: + model = ( + torch.nn.Sequential( + torch.nn.ReLU(), + torch.nn.Linear(k, n, bias=True), + torch.nn.ReLU(), + ) + .to(device) + .to(dtype) ) - .to(device) - .to(dtype) - ) - ref = model(example_input) - torchao.autoquant( - model, - qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, - ) - out = model(example_input) - from torchao.quantization.autoquant import ( - BFloat16Tensor, - Float16Tensor, - Float32Tensor, - ) + ref = model(example_input) + qtensor_class_list = [qclass] + torchao.autoquant( + model, + qtensor_class_list=qtensor_class_list, + ) + out = model(example_input) + self.assertIn( + type(model[1].weight), + qtensor_class_list, + ) + self.assertGreater(compute_error(out, ref), 40) - self.assertIn( - type(model[1].weight), [Float32Tensor, Float16Tensor, BFloat16Tensor] - ) - print(compute_error(out, ref)) - self.assertGreater(compute_error(out, ref), 60) + @parameterized.expand(COMMON_DEVICE_DTYPE) + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf( + not TORCH_VERSION_AT_LEAST_2_5, "autoquant int4 option requires 2.5+." + ) + @unittest.skipIf(not has_gemlite, "gemlite not available") + def test_autoquant_int4wo(self, device, dtype): + if device == "cpu": + self.skipTest(f"int4wo is for cuda, not {device}") + + m, k, n = 128, 128, 128 + example_input = torch.randn(m, k, device=device, dtype=dtype) + + for qclass in [ + AQGemliteInt4G64WeightOnlyQuantizedLinearWeight, + AQInt4G32WeightOnlyQuantizedLinearWeight, + AQInt4G128WeightOnlyQuantizedMarlinSparseLinearWeight, + ]: + model = ( + torch.nn.Sequential( + torch.nn.ReLU(), + torch.nn.Linear(k, n, bias=True), + torch.nn.ReLU(), + ) + .to(device) + .to(dtype) + ) + ref = model(example_input) + qtensor_class_list = [qclass] + torchao.autoquant( + model, + qtensor_class_list=qtensor_class_list, + ) + out = model(example_input) + + self.assertIn(type(model[1].weight), qtensor_class_list) + self.assertGreater(compute_error(ref, out), 20) + + @parameterized.expand(COMMON_DEVICE_DTYPE) + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf( + not TORCH_VERSION_AT_LEAST_2_5, "autoquant int4 option requires 2.5+." + ) + def test_autoquant_float8(self, device, dtype): + if device == "cpu": + self.skipTest(f"int4wo is for cuda, not {device}") + + # note: marlin sparse layout failed when scale_t has a dimension of 1d + m, k, n = 128, 128, 128 + example_input = torch.randn(m, k, device=device, dtype=dtype) + + for qclass in [ + AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight, + AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight, + AQFloat8WeightOnlyQuantizedLinearWeight, + ]: + model = ( + torch.nn.Sequential( + torch.nn.ReLU(), + torch.nn.Linear(k, n, bias=True), + torch.nn.ReLU(), + ) + .to(device) + .to(dtype) + ) + ref = model(example_input) + qtensor_class_list = [qclass] + torchao.autoquant( + model, + qtensor_class_list=qtensor_class_list, + ) + out = model(example_input) + + self.assertIn(type(model[1].weight), qtensor_class_list) + self.assertGreater(compute_error(ref, out), 20) @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "requires 2.5+.") diff --git a/torchao/dtypes/uintx/marlin_sparse_layout.py b/torchao/dtypes/uintx/marlin_sparse_layout.py index e37623182a..2a84dd1813 100644 --- a/torchao/dtypes/uintx/marlin_sparse_layout.py +++ b/torchao/dtypes/uintx/marlin_sparse_layout.py @@ -227,6 +227,11 @@ def from_plain( # Linear layers are (in_features, out_features) but the int_data that is reaching this point # is (out_features, in_features). We need to transpose it to match the expected shape in the marlin code. q_w_24 = int_data.t() + # addressing the case when scale has dimension 1, happens when + # weight_shape[-1] == group_size == 128 + if scale.ndim == 1: + scale = scale.reshape(scale.shape[0], -1) + scale_t = scale.t() if not torch.cuda.get_device_capability()[0] >= 8: diff --git a/torchao/quantization/autoquant.py b/torchao/quantization/autoquant.py index d506d2b65e..d49e84e066 100644 --- a/torchao/quantization/autoquant.py +++ b/torchao/quantization/autoquant.py @@ -16,6 +16,7 @@ from torchao.kernel import safe_int_mm from torchao.quantization.linear_activation_quantized_tensor import ( LinearActivationQuantizedTensor, + to_linear_activation_quantized, ) from torchao.quantization.quant_primitives import ( MappingType, @@ -370,6 +371,18 @@ def _is_interpolate_mode(mode): return False +def _to_float16(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float16) + + +def _to_bfloat16(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.bfloat16) + + +def _identity(x: torch.Tensor) -> torch.Tensor: + return x + + class AQMixin: """ Tests and benchmarks the autoquantization process for the given activation matrix, weight, and bias. @@ -610,9 +623,11 @@ def _quantized_linear_op(act_mat, w_qtensor, bias): return y -class AQInt4G32WeightOnlyQuantizedLinearWeight(AffineQuantizedTensor, AQMixin): +class AQInt4G32WeightOnlyQuantizedLinearWeight( + LinearActivationQuantizedTensor, AQMixin +): """ - AutoQuantizable version of Int4WeightOnlyQuantizedLinearWeight + AutoQuantizable version of int4_weight_only """ group_size: int = 32 @@ -621,20 +636,30 @@ class AQInt4G32WeightOnlyQuantizedLinearWeight(AffineQuantizedTensor, AQMixin): @classmethod def from_float(cls, weight): + from torchao.dtypes import to_affine_quantized_intx + group_size = cls.group_size _layout = cls.aq_layout if weight.shape[-1] % group_size != 0: return weight + input_quant_func = None + + # NOTE: we only convert activation dtype and weight dtype here + # because the kernel implementation for both TensorCoreTiledLayout and MarlinSparseLayout + # can work with multiple bias dtypes (by converting bias to the dtype of activation) if ( isinstance(_layout, TensorCoreTiledLayout) and weight.dtype != torch.bfloat16 ): - return weight - - if isinstance(_layout, MarlinSparseLayout) and weight.dtype != torch.float16: - return weight + weight = weight.to(torch.bfloat16) + input_quant_func = _to_bfloat16 + elif isinstance(_layout, MarlinSparseLayout) and weight.dtype != torch.float16: + weight = weight.to(torch.float16) + input_quant_func = _to_float16 + else: + input_quant_func = _identity use_hqq = True mapping_type = MappingType.ASYMMETRIC @@ -653,7 +678,7 @@ def from_float(cls, weight): zero_point_domain = ZeroPointDomain.INT use_hqq = False - return super(AQInt4G32WeightOnlyQuantizedLinearWeight, cls).from_hp_to_intx( + weight = to_affine_quantized_intx( weight, mapping_type, block_size, @@ -668,6 +693,10 @@ def from_float(cls, weight): use_hqq=use_hqq, ) + return super(AQInt4G32WeightOnlyQuantizedLinearWeight, cls).from_float( + weight, input_quant_func + ) + class AQInt4G64WeightOnlyQuantizedLinearWeight( AQInt4G32WeightOnlyQuantizedLinearWeight @@ -694,16 +723,19 @@ class AQInt4G128WeightOnlyQuantizedMarlinSparseLinearWeight( aq_layout: Layout = MarlinSparseLayout() -class AQGemliteInt4G32WeightOnlyQuantizedLinearWeight(AffineQuantizedTensor, AQMixin): +class AQGemliteInt4G32WeightOnlyQuantizedLinearWeight( + LinearActivationQuantizedTensor, AQMixin +): group_size: int = 32 @classmethod def from_float(cls, weight): - if weight.dtype != torch.float16: - return weight - + from torchao.dtypes import to_affine_quantized_intx from torchao.dtypes.uintx.gemlite_layout import get_gemlite_aqt_kwargs + if weight.dtype != torch.float16: + weight = weight.to(torch.float16) + bit_width = 4 packing_bitwidth = 32 contiguous = None @@ -711,9 +743,12 @@ def from_float(cls, weight): aqt_kwargs = get_gemlite_aqt_kwargs( weight, cls.group_size, bit_width, packing_bitwidth, contiguous, use_hqq ) - return super( - AQGemliteInt4G32WeightOnlyQuantizedLinearWeight, cls - ).from_hp_to_intx(weight, **aqt_kwargs) + weight = to_affine_quantized_intx(weight, **aqt_kwargs) + input_quant_func = _to_float16 + + return super(AQGemliteInt4G32WeightOnlyQuantizedLinearWeight, cls).from_float( + weight, input_quant_func + ) class AQGemliteInt4G64WeightOnlyQuantizedLinearWeight( @@ -755,11 +790,24 @@ def from_float(cls, weight): return weight +# TODO: remove skip_weight_conversion arg class Float32Tensor(TorchAOBaseTensor): """Tensor subclass tensor for fp32 dtype""" - def __init__(self, weight): - self.weight = weight.to(torch.float32) + @staticmethod + def __new__(cls, weight, skip_weight_conversion=False): + kwargs = {} + kwargs["device"] = weight.device + kwargs["layout"] = ( + kwargs.get("layout") if kwargs.get("layout", False) else weight.layout + ) + kwargs["dtype"] = weight.dtype + kwargs["requires_grad"] = False + shape = weight.shape + return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) + + def __init__(self, weight, skip_weight_conversion=False): + self.weight = weight if skip_weight_conversion else weight.to(torch.float32) @staticmethod def _quantized_linear_op(act_mat, w_qtensor, bias): @@ -778,7 +826,7 @@ def _apply_fn_to_data(self, fn): @classmethod def from_float(cls, weight): - return Float32Tensor(weight) + return cls(weight) @Float32Tensor.implements([torch.nn.functional.linear, aten.linear.default]) @@ -816,8 +864,8 @@ def _(func, types, args, kwargs): class BFloat16Tensor(Float32Tensor): - def __init__(self, weight): - self.weight = weight.to(torch.bfloat16) + def __init__(self, weight, skip_weight_conversion=False): + self.weight = weight if skip_weight_conversion else weight.to(torch.bfloat16) @staticmethod def _quantized_linear_op(act_mat, w_qtensor, bias): @@ -830,13 +878,13 @@ def _quantized_linear_op(act_mat, w_qtensor, bias): ).to(dtype=orig_dtype) @classmethod - def from_float(cls, weight): - return BFloat16Tensor(weight) + def from_float(cls, weight, skip_weight_conversion=False): + return cls(weight, skip_weight_conversion) class Float16Tensor(Float32Tensor): - def __init__(self, weight): - self.weight = weight.to(torch.float16) + def __init__(self, weight, skip_weight_conversion=False): + self.weight = weight if skip_weight_conversion else weight.to(torch.float16) @staticmethod def _quantized_linear_op(act_mat, w_qtensor, bias): @@ -849,8 +897,8 @@ def _quantized_linear_op(act_mat, w_qtensor, bias): ).to(dtype=orig_dtype) @classmethod - def from_float(cls, weight): - return Float16Tensor(weight) + def from_float(cls, weight, skip_weight_conversion=False): + return cls(weight, skip_weight_conversion) class AQFloat32LinearWeight(Float32Tensor, AQMixin): @@ -911,9 +959,7 @@ def from_float(cls, weight): ) -class AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight( - AQMixin, LinearActivationQuantizedTensor -): +class AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight(AQMixin, BFloat16Tensor): """ AutoQuantizable version of Float8DynamicallyQuantizedLinearWeight using per row scaling """ @@ -942,12 +988,13 @@ def get_per_token_block_size(x): input_target_dtype = torch.float8_e4m3fn _layout = Float8Layout(mm_config=Float8MMConfig(use_fast_accum=True)) # TODO: make this serializable - input_quant_func = lambda x: _input_activation_quant_func_fp8( - x=x, - activation_granularity=cls.activation_granularity, - activation_dtype=input_target_dtype, - ) + input_quant_func = _input_activation_quant_func_fp8 + input_quant_kwargs = { + "activation_granularity": cls.activation_granularity, + "activation_dtype": input_target_dtype, + } block_size = get_weight_block_size(weight) + weight = to_affine_quantized_floatx( input_float=weight, block_size=block_size, @@ -955,10 +1002,15 @@ def get_per_token_block_size(x): _layout=_layout, scale_dtype=torch.float32, ) - weight = super( + weight = to_linear_activation_quantized( + weight, input_quant_func, quant_kwargs=input_quant_kwargs + ) + # at inference time, + # we first convert the input, weight and bias to bfloat16, and then quantize activation + # and then dispatch to the quantized ops + return super( AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight, cls - ).from_float(weight, input_quant_func) - return weight + ).from_float(weight, skip_weight_conversion=True) class AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight( @@ -982,15 +1034,14 @@ def get_weight_block_size(x): return x.shape target_dtype = torch.float8_e4m3fn - input_target_dtype = torch.float8_e4m3fn _layout = Float8Layout(mm_config=Float8MMConfig(use_fast_accum=True)) - # TODO: make this serializable - input_quant_func = lambda x: _input_activation_quant_func_fp8( - x=x, - activation_granularity=cls.activation_granularity, - activation_dtype=input_target_dtype, - ) + # TODO: test serializable + input_quant_func = _input_activation_quant_func_fp8 + input_quant_args = { + "activation_granularity": cls.activation_granularity, + "activation_dtype": input_target_dtype, + } block_size = get_weight_block_size(weight) weight = to_affine_quantized_floatx( input_float=weight, @@ -1001,7 +1052,7 @@ def get_weight_block_size(x): ) weight = super( AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight, cls - ).from_float(weight, input_quant_func) + ).from_float(weight, input_quant_func, input_quant_args) return weight @@ -1299,3 +1350,10 @@ def finalize_autoquant(): if TORCH_VERSION_AT_LEAST_2_5: torch.serialization.add_safe_globals(ALL_AUTOQUANT_CLASS_LIST) + torch.serialization.add_safe_globals( + [ + _to_float16, + _to_bfloat16, + _identity, + ] + )