
Add LSQ quantizer #3503

Merged
merged 18 commits into from
May 18, 2021
74 changes: 61 additions & 13 deletions examples/model_compress/quantization/LSQ_torch_quantizer.py
@@ -2,6 +2,7 @@
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import LsqQuantizer
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT


class Mnist(torch.nn.Module):
@@ -14,12 +15,14 @@ def __init__(self):
self.relu1 = torch.nn.ReLU6()
self.relu2 = torch.nn.ReLU6()
self.relu3 = torch.nn.ReLU6()
self.max_pool1 = torch.nn.MaxPool2d(2, 2)
self.max_pool2 = torch.nn.MaxPool2d(2, 2)

def forward(self, x):
x = self.relu1(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = self.max_pool1(x)
x = self.relu2(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = self.max_pool2(x)
x = x.view(-1, 4 * 4 * 50)
x = self.relu3(self.fc1(x))
x = self.fc2(x)
@@ -38,6 +41,7 @@ def train(model, quantizer, device, train_loader, optimizer):
if batch_idx % 100 == 0:
print('{:2.0f}% Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))


def test(model, device, test_loader):
model.eval()
test_loss = 0
@@ -54,6 +58,24 @@ def test(model, device, test_loader):
print('Loss: {} Accuracy: {}%)\n'.format(
test_loss, 100 * correct / len(test_loader.dataset)))


def test_trt(engine, test_loader):
test_loss = 0
correct = 0
time_elapsed = 0
for data, target in test_loader:
output, time = engine.inference(data)
test_loss += F.nll_loss(output, target, reduction='sum').item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
time_elapsed += time
test_loss /= len(test_loader.dataset)

print('Loss: {} Accuracy: {}%'.format(
test_loss, 100 * correct / len(test_loader.dataset)))
print("Inference elapsed_time (whole dataset): {}s".format(time_elasped))


def main():
torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -71,17 +93,27 @@ def main():
DoReFaQuantizer(configure_list).compress(model)
'''
Contributor
this comment can be removed

Contributor Author
done

configure_list = [{
'quant_types': ['weight'],
'quant_bits': {
'weight': 8,
}, # you can just use `int` here because all `quan_types` share same bits length, see config for `ReLu6` below.
'op_types':['Conv2d', 'Linear']
}, {
'quant_types': ['output'],
'quant_bits': 8,
'quant_start_step': 1000,
'op_types':['ReLU6']
}]
'quant_types': ['weight', 'input'],
'quant_bits': {'weight': 8, 'input': 8},
'op_names': ['conv1']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8, },
'op_names': ['relu1']
}, {
'quant_types': ['weight', 'input'],
'quant_bits': {'weight': 8, 'input': 8},
'op_names': ['conv2']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu2']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['max_pool2']
}
]
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = LsqQuantizer(model, configure_list, optimizer)
quantizer.compress()
@@ -92,6 +124,22 @@ def main():
train(model, quantizer, device, train_loader, optimizer)
test(model, device, test_loader)

model_path = "mnist_model.pth"
calibration_path = "mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)

test(model, device, test_loader)

print("calibration_config: ", calibration_config)

batch_size = 32
input_shape = (batch_size, 1, 28, 28)

engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=batch_size)
engine.compress()

test_trt(engine, test_loader)


if __name__ == '__main__':
main()
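For readers unfamiliar with LSQ, the core operation that LsqQuantizer simulates during training is learned step-size quantization: the scale is a trainable parameter, rounding uses a straight-through estimator, and the gradient reaching the scale is damped. A minimal standalone sketch, following the formulation in the LSQ paper rather than the exact NNI code:

```python
import torch

def grad_scale(x, scale):
    # forward value is x; the gradient through this op is multiplied by `scale`
    return (x - x * scale).detach() + x * scale

def round_pass(x):
    # straight-through estimator: round on the forward pass, identity gradient backward
    return (x.round() - x).detach() + x

def lsq_quantize(x, s, qmin, qmax):
    # gradient damping factor from the LSQ paper: 1 / sqrt(N * qmax)
    g = 1.0 / (x.numel() * qmax) ** 0.5
    s = grad_scale(s, g)
    x = torch.clamp(x / s, qmin, qmax)
    x = round_pass(x)
    return x * s  # fake-quantized tensor in the original value range

# usage sketch; the scale initialization mirrors the one used in this PR
w = torch.randn(20, 16, requires_grad=True)
s = torch.nn.Parameter(w.detach().abs().mean() * 2 / (127 ** 0.5))
wq = lsq_quantize(w, s, qmin=-128, qmax=127)
```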
80 changes: 55 additions & 25 deletions nni/algorithms/compression/pytorch/quantization/quantizers.py
@@ -593,28 +593,43 @@ def __init__(self, model, config_list, optimizer=None):
modules_to_compress = self.get_modules_to_compress()
self.bound_model.register_buffer("steps", torch.Tensor([1]))
for layer, config in modules_to_compress:
layer.module.register_parameter("scale", torch.nn.Parameter(torch.Tensor([1.0])))
if "weight" in config.get("quant_types", []):
# todo: support per-channel quantization for weight since TensorRT it for conv weight
layer.module.register_parameter("weight_scale", torch.nn.Parameter(torch.Tensor([1.0])))
# todo: support per-channel quantization for weight since TensorRT use it for conv weight
q_bit = get_bits_length(config, "weight")
Contributor
In the current implementation, do we only support a single bit width in LsqQuantizer, or can we support mixed precision right now?

Contributor Author
It seems that mixed-precision quantization is already supported by this implementation, since each layer has its own q_bit. We can achieve mixed precision through specific settings in config_list, for example:

configure_list = [{
        'quant_types': ['weight'],
        'quant_bits': 8,
        'op_types': ['Conv2d'],
        'op_names': ['features.3']
    }, {
        'quant_types': ['weight'],
        'quant_bits': 7,
        'op_types': ['Conv2d'],
        'op_names': ['features.6']
    }]

layer.module.register_buffer('weight_bit', torch.Tensor([q_bit]))
qmax = 2 ** (q_bit - 1) - 1
qmin = -2 ** (q_bit - 1)
init_weight_scale = layer.module.weight.data.detach().abs().mean() * 2 / (qmax ** 0.5)
layer.module.scale = torch.nn.Parameter(init_weight_scale)
layer.module.weight_scale = torch.nn.Parameter(init_weight_scale)
layer.module.weight_qmax = qmax
layer.module.weight_qmin = qmin

self.optimizer.add_param_group({"params": layer.module.weight_scale})

if "output" in config.get("quant_types", []):
# scale of activation will be initialized using the first batch data
layer.module.register_parameter("output_scale", torch.nn.Parameter(torch.Tensor([1.0])))
q_bit = get_bits_length(config, "output")
layer.module.register_buffer('activation_bit', torch.Tensor([q_bit]))
layer.module.register_buffer('output_bit', torch.Tensor([q_bit]))
qmax = 2 ** (q_bit - 1) - 1
qmin = -2 ** (q_bit - 1)
layer.module.output_qmax = qmax
layer.module.output_qmin = qmin

self.optimizer.add_param_group({"params": layer.module.output_scale})

if "input" in config.get("quant_types", []):
# scale of activation will be initialized using the first batch data
Contributor
activation -> input

Contributor Author
done

layer.module.register_parameter("input_scale", torch.nn.Parameter(torch.Tensor([1.0])))
q_bit = get_bits_length(config, "input")
layer.module.register_buffer('input_bit', torch.Tensor([q_bit]))
qmax = 2 ** (q_bit - 1) - 1
qmin = -2 ** (q_bit - 1)
layer.module.activation_qmax = qmax
layer.module.activation_qmin = qmin
# add scale to optimizer since they are updated through the gradient
self.optimizer.add_param_group({"params": layer.module.scale})
layer.module.input_qmax = qmax
layer.module.input_qmin = qmin

self.optimizer.add_param_group({"params": layer.module.input_scale})

@staticmethod
def grad_scale(x, scale):
@@ -649,7 +664,7 @@ def quantize_weight(self, wrapper, **kwargs):
# todo: add support for quantize bias. If we use TensorRT as backend, there is no need to quantize
# bias
old_weight = module.old_weight
weight = self.quantize(old_weight, module.scale, module.weight_qmin, module.weight_qmax)
weight = self.quantize(old_weight, module.weight_scale, module.weight_qmin, module.weight_qmax)
module.weight = weight
return weight

@@ -658,13 +673,28 @@ def quantize_output(self, output, wrapper, **kwargs):

# initialize the scale
if self.bound_model.steps == 1:
qmax = module.activation_qmax
qmax = module.output_qmax
init_oup_scale = output.data.detach().abs().mean() * 2 / (qmax ** 0.5)
module.scale.data = init_oup_scale
module.output_scale.data = init_oup_scale

output = self.quantize(output, module.scale, module.activation_qmin, module.activation_qmax)
output = self.quantize(output, module.output_scale, module.output_qmin, module.output_qmax)
return output
Contributor
Can this quantization algorithm support exporting the model and the related quantization parameters? If so, maybe we can consider adding an export_model() function based on what parameters should be exported to an inference framework like TensorRT.

Contributor Author
I will check it out.


def quantize_input(self, *inputs, wrapper, **kwargs):
# This is hacky since it is not recommended to modify a tuple
# NB: support layers with multi inputs
module = wrapper.module
# initialize the scale
if self.bound_model.steps == 1:
qmax = module.input_qmax
init_oup_scale = inputs[0].data.detach().abs().mean() * 2 / (qmax ** 0.5)
module.input_scale.data = init_oup_scale

new_input = self.quantize(inputs[0], module.input_scale, module.input_qmin, module.input_qmax)
list_inp = list(inputs)
list_inp[0] = new_input
Contributor
Why do we only quantize the first input?

Contributor Author (chenbohua3, May 17, 2021)
It seems that the quantization framework currently only supports layers with a single input (see here; so does the TRT backend, see here), so the current implementation does not support layers with multiple inputs. It may be better to extend the LSQ quantizer to multi-input layers once the framework supports them.

Contributor
Got it, that is reasonable.

return tuple(list_inp)

def export_model(self, model_path, calibration_path=None, onnx_path=None, input_shape=None, device=None):
"""
Export quantized model weights and calibration parameters(optional)
@@ -692,18 +722,18 @@ def export_model(self, model_path, calibration_path=None, onnx_path=None, input_
calibration_config = {}

for name, module in self.bound_model.named_modules():
if hasattr(module, 'weight_bit') or hasattr(module, 'activation_bit'):
if hasattr(module, 'input_bit') or hasattr(module, 'output_bit'):
calibration_config[name] = {}
if hasattr(module, 'weight_bit'):
calibration_config[name]['weight_bit'] = int(module.weight_bit)
abs_max_weight = float(module.scale * module.weight_qmax)
calibration_config[name]['tracked_min_input'] = -abs_max_weight
calibration_config[name]['tracked_max_input'] = abs_max_weight
if hasattr(module, 'activation_bit'):
calibration_config[name]['activation_bit'] = int(module.activation_bit)
abs_max_activation = float(module.scale * module.activation_qmax)
calibration_config[name]['tracked_min_activation'] = -abs_max_activation
calibration_config[name]['tracked_max_activation'] = abs_max_activation
if hasattr(module, 'input_bit'):
calibration_config[name]['weight_bit'] = int(module.input_bit)
Contributor
Why assign calibration_config[name]['weight_bit'] from module.input_bit instead of module.weight_bit? If weight_bit is not equal to input_bit when setting the config, the export result will be incorrect.

Contributor Author
According to here, weight_bit is used to decide whether to set the input tensor's dynamic range, which I think may not be appropriate. Assigning input_bit to weight_bit here is just to stay consistent with that.

Contributor
Currently, we record the range of the input tensor while quantizing the weight in the QAT algorithm. We handle it this way because integration with TensorRT requires the input tensor's dynamic range when setting a layer's precision to 8 bit, so we record the input dynamic range as here.
If we want to export an LSQ model to TensorRT, the input dynamic range should also be set in most situations, and input_bit should be the same as weight_bit.
However, it is still strange not to set calibration_config[name]['weight_bit'] from weight_bit, since we already have the value of weight_bit.

Contributor Author
Got it. How about changing the code like this:

 if hasattr(module, 'weight_bit'):
     calibration_config[name]['weight_bit'] = int(module.weight_bit)
     abs_max_input = float(module.input_scale * module.input_qmax)
     calibration_config[name]['tracked_min_input'] = -abs_max_input
     calibration_config[name]['tracked_max_input'] = abs_max_input

abs_max_input = float(module.input_scale * module.input_qmax)
calibration_config[name]['tracked_min_input'] = -abs_max_input
calibration_config[name]['tracked_max_input'] = abs_max_input
if hasattr(module, 'output_bit'):
calibration_config[name]['activation_bit'] = int(module.output_bit)
abs_max_output = float(module.output_scale * module.output_qmax)
calibration_config[name]['tracked_min_activation'] = -abs_max_output
calibration_config[name]['tracked_max_activation'] = abs_max_output
self._del_simulated_attr(module)

self.export_model_save(self.bound_model, model_path, calibration_config, calibration_path, onnx_path,
@@ -715,8 +745,8 @@ def _del_simulated_attr(self, module):
"""
delete redundant parameters in quantize module
"""
del_attr_list = ['old_weight', 'ema_decay', 'tracked_min_activation', 'tracked_max_activation', 'tracked_min_input', \
'tracked_max_input', 'scale', 'zero_point', 'weight_bit', 'activation_bit']
del_attr_list = ['old_weight', 'tracked_min_input', 'tracked_max_input', 'tracked_min_activation', \
'tracked_max_activation', 'output_scale', 'input_scale', 'weight_scale','weight_bit', 'output_bit', 'input_bit']
for attr in del_attr_list:
if hasattr(module, attr):
delattr(module, attr)
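For context, the calibration_config returned by export_model maps each quantized layer name to its bit widths and tracked dynamic ranges, which ModelSpeedupTensorRT uses when setting per-layer precision. An illustrative example for the MNIST config above, with made-up range values:

```python
# Hypothetical output -- the layer names come from the example model; the numeric
# ranges stand in for values derived from the learned scales and the q_max bounds.
calibration_config = {
    'conv1': {
        'weight_bit': 8,
        'tracked_min_input': -0.92,
        'tracked_max_input': 0.92,
    },
    'relu1': {
        'activation_bit': 8,
        'tracked_min_activation': -3.81,
        'tracked_max_activation': 3.81,
    },
}
```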
2 changes: 1 addition & 1 deletion nni/compression/pytorch/compressor.py
@@ -746,7 +746,7 @@ def _check_weight(module):

def quantize_helper(tensor, quant_type, wrapper, input_tensor=None, **kwargs):
if quant_type == QuantType.QUANT_INPUT:
output = wrapper.quantizer.quantize_input(tensor, wrapper, **kwargs)
output = wrapper.quantizer.quantize_input(*tensor, wrapper=wrapper, **kwargs)
elif quant_type == QuantType.QUANT_WEIGHT:
output = wrapper.quantizer.quantize_weight(wrapper, input_tensor=input_tensor, **kwargs)
elif quant_type == QuantType.QUANT_OUTPUT:
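The one-line change above follows the new variadic signature quantize_input(self, *inputs, wrapper, **kwargs): the wrapper collects a module's inputs as a tuple, so the tuple has to be unpacked with * and wrapper passed by keyword. A minimal sketch of the calling convention, with hypothetical names:

```python
import torch

def quantize_input(*inputs, wrapper, **kwargs):
    # `wrapper` is keyword-only because it follows the variadic *inputs.
    # As noted in the review, only the first input is quantized for now.
    first = torch.clamp(inputs[0], -1.0, 1.0)  # placeholder for the real quantize step
    return (first,) + inputs[1:]

inputs = (torch.randn(2, 3), torch.randn(2, 3))
outputs = quantize_input(*inputs, wrapper=None)  # unpack the tuple, pass wrapper by keyword
```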