# chatglm-6b / gptq_quantization.py
import contextlib
import logging
import math
from typing import List, Optional

import torch
import transformers
from torch import nn

LOGGER = logging.getLogger(__name__)

QUANT_LAYERS = [nn.Linear, nn.Conv2d, transformers.Conv1D]


def is_transformer_conv1d(layer):
    return isinstance(layer, transformers.Conv1D)


# These two functions only work on per-channel symmetric quantization for weight
def get_weight_scale(weight, weight_bit_width):
    weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
    return weight_scale


def fake_quantize_weight(weight, weight_scale):
    weight_scale = weight_scale[:, None]
    fake_quantized_weight = torch.round(weight / weight_scale) * weight_scale
    return fake_quantized_weight
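

# Illustrative sketch (not part of the original pipeline): how the two helpers above are
# meant to be combined for per-channel symmetric fake quantization. `_demo_fake_quantize`
# and the random weight shape are hypothetical, added only to document the intended use.
def _demo_fake_quantize(weight_bit_width=8):
    weight = torch.randn(4, 16)                            # [out_features, in_features]
    scale = get_weight_scale(weight, weight_bit_width)     # one scale per output channel
    fake_quantized = fake_quantize_weight(weight, scale)   # rounded to the int grid, kept in float
    return (weight - fake_quantized).abs().max()           # worst-case per-element error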


class GPTQLayerWrapper:
    """Wraps one quantizable layer: accumulates input (Hessian) statistics and applies the GPTQ weight update."""

    def __init__(self, layer_name, layer, weight_bit_width):
        super().__init__()
        self.layer_name = layer_name
        self.layer = layer
        self.device = layer.weight.device
        columns = layer.weight.shape[1]
        self.columns = columns
        self.H = torch.zeros((columns, columns), device=self.device)
        self.nsamples = 0
        self.is_record = True
        self.weight_bit_width = weight_bit_width
        self.weight_scale = None

    def record_h(self, x):
        # accumulate the running estimate of H = 2/n * sum(x x^T) from the layer inputs
        if self.is_record:
            x = x.detach().clone()
            if len(x.shape) == 2:
                x = x.unsqueeze(0)
            batch = x.shape[0]
            if isinstance(self.layer, nn.Linear) or is_transformer_conv1d(self.layer):
                if len(x.shape) == 3:
                    x = x.reshape((-1, x.shape[-1]))
                x = x.t()
            if isinstance(self.layer, nn.Conv2d):
                unfold = nn.Unfold(
                    self.layer.kernel_size,
                    dilation=self.layer.dilation,
                    padding=self.layer.padding,
                    stride=self.layer.stride
                )
                x = unfold(x)
                x = x.permute([1, 0, 2])
                x = x.flatten(1)
            self.H *= self.nsamples / (self.nsamples + batch)
            self.nsamples += batch
            x = math.sqrt(2 / self.nsamples) * x.float()
            self.H += x.matmul(x.t())

    def quant_weight(self, blocksize=128, percdamp=.01, groupsize=-1):
        if groupsize != -1:
            raise RuntimeError("Group quantization of gptq quantizer is not supported for now")
        weight = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            weight = weight.flatten(1)
        if is_transformer_conv1d(self.layer):
            weight = weight.t()
        weight = weight.float()

        weight_scale = get_weight_scale(weight, self.weight_bit_width)
        # todo: use buffer to store scale
        self.weight_scale = weight_scale

        H = self.H
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        weight[:, dead] = 0

        losses = torch.zeros_like(weight)
        Q = torch.zeros_like(weight)

        # dampen the Hessian diagonal, then compute its inverse via Cholesky factorization
        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.device)
        H[diag, diag] += damp
        try:
            H = torch.linalg.cholesky(H)
            H = torch.cholesky_inverse(H)
            H = torch.linalg.cholesky(H, upper=True)
        except Exception:
            LOGGER.warning(f"Cannot do compression on layer {self.layer_name} because of inverse error")
            return
        if H.isnan().any():
            LOGGER.warning(f"Cannot do compression on layer {self.layer_name} because of inverse error")
            return
        hinv = H

        # quantize column blocks, propagating the rounding error to the remaining columns
        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            w1 = weight[:, i1:i2].clone()
            q1 = torch.zeros_like(w1)
            total_err = torch.zeros_like(w1)
            losses1 = torch.zeros_like(w1)
            hinv1 = hinv[i1:i2, i1:i2]

            for i in range(count):
                w = w1[:, i]
                d = hinv1[i, i]
                q = fake_quantize_weight(w.unsqueeze(1), weight_scale).flatten()
                q1[:, i] = q
                losses1[:, i] = (w - q) ** 2 / d ** 2

                err = (w - q) / d
                w1[:, i:] -= err.unsqueeze(1).matmul(hinv1[i, i:].unsqueeze(0))
                total_err[:, i] = err

            Q[:, i1:i2] = q1
            losses[:, i1:i2] = losses1 / 2
            weight[:, i2:] -= total_err.matmul(hinv[i1:i2, i2:])

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        if is_transformer_conv1d(self.layer):
            Q = Q.t()
        self.layer.weight = nn.Parameter(Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype), requires_grad=False)

        del self.H
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def release_gpu_memory(self):
        if hasattr(self, "H"):
            del self.H
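

# Illustrative sketch (not part of the original flow): driving a single GPTQLayerWrapper by
# hand. In the real pipeline GPTQBlockWrapper registers forward pre-hooks that call record_h
# automatically; `_demo_layer_wrapper` and its toy shapes are hypothetical.
def _demo_layer_wrapper():
    layer = nn.Linear(16, 8)
    wrapper = GPTQLayerWrapper("demo.linear", layer, weight_bit_width=8)
    for _ in range(4):                        # a few calibration batches
        wrapper.record_h(torch.randn(2, 16))
    wrapper.quant_weight(blocksize=8)         # weights are fake-quantized in place
    return layer.weight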


class GPTQBlockWrapper:
    """Wraps one transformer block: hooks every quantizable layer inside it so calibration inputs are recorded per block."""

    def __init__(self, module_name: str, module: nn.Module, weight_bit_width=8):
        self.layer_wrappers = {}
        self.hook_handles = []
        # module order in the whole network
        self.order = 0
        self.module_name = module_name

        def get_hook(layer_name):
            def record_hook(_, x):
                self.layer_wrappers[layer_name].record_h(x[0])
            return record_hook

        for layer_name, layer in module.named_modules():
            if isinstance(layer, tuple(QUANT_LAYERS)):
                full_layer_name = f"{module_name}.{layer_name}" if layer_name else f"{module_name}"
                self.layer_wrappers[full_layer_name] = GPTQLayerWrapper(full_layer_name, layer, weight_bit_width)
                handle = layer.register_forward_pre_hook(get_hook(full_layer_name))
                self.hook_handles.append(handle)

    def quant_module(self):
        for _, wrapper in self.layer_wrappers.items():
            wrapper.quant_weight()
        for h in self.hook_handles:
            h.remove()

    def set_order(self, idx):
        self.order = idx

    def get_order(self):
        return self.order

    def enable(self):
        for n, l in self.layer_wrappers.items():
            l.is_record = True

    def disable(self):
        for n, l in self.layer_wrappers.items():
            l.is_record = False

    def release_gpu_memory(self):
        for _, wrapper in self.layer_wrappers.items():
            wrapper.release_gpu_memory()
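

# Illustrative sketch (hypothetical toy block, not a ChatGLM GLMBlock): a plain forward pass
# through a wrapped block is enough to collect the Hessian statistics via the registered
# pre-hooks, after which quant_module applies GPTQ to every hooked layer.
def _demo_block_wrapper():
    block = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))
    wrapper = GPTQBlockWrapper("demo_block", block, weight_bit_width=8)
    for _ in range(4):
        block(torch.randn(2, 16))    # forward pre-hooks call record_h on each Linear
    wrapper.quant_module()           # quantize the weights and remove the hooks
    return block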


class GPTQuantizer:
    """Drives block-by-block GPTQ calibration for every sub-module whose type is listed in block_type."""

    def __init__(self, block_type: Optional[List[type]] = None):
        self.gptq_block_wrappers = {}
        self.block_type = block_type

    def wrap_model(self, model: nn.Module, weight_bit_width=8):
        def wrap_block(m, prefix=""):
            for name, child in m.named_children():
                child_prefix = f"{prefix}.{name}" if prefix else name
                if isinstance(child, tuple(self.block_type)):
                    self.gptq_block_wrappers[name] = GPTQBlockWrapper(child_prefix, child, weight_bit_width)
                    LOGGER.debug(f"Calibrate module {child_prefix} as a whole block in GPTQ")
                else:
                    wrap_block(child, child_prefix)

        wrap_block(model)
        return model

    def quantize(self, model: nn.Module):
        for _, module_wrapper in self.gptq_block_wrappers.items():
            module_wrapper.quant_module()
        return model

    @property
    def calibration_iters(self):
        return len(self.gptq_block_wrappers)

    @contextlib.contextmanager
    def record_order(self):
        """Run one forward pass inside this context to record the execution order of the wrapped blocks."""
        counter = 0
        record_handles = []
        orders = {}
        try:
            def get_record_order_hook(module_name):
                def record_hook(*args, **kwargs):
                    nonlocal counter
                    if module_name not in orders:
                        orders[module_name] = counter
                        counter += 1
                return record_hook

            for module_name, module_wrapper in self.gptq_block_wrappers.items():
                # disable the record
                for _, layer_wrapper in module_wrapper.layer_wrappers.items():
                    layer_wrapper.is_record = False
                one_layer_wrapper_in_module = list(module_wrapper.layer_wrappers.values())[0]
                handles = one_layer_wrapper_in_module.layer.register_forward_pre_hook(get_record_order_hook(module_name))
                record_handles.append(handles)
            yield
        except Exception as e:
            LOGGER.warning(e)
        finally:
            for module_name, order in orders.items():
                self.gptq_block_wrappers[module_name].set_order(order)
            for h in record_handles:
                h.remove()
            for module_name, module_wrapper in self.gptq_block_wrappers.items():
                # re-enable the record
                for _, layer_wrapper in module_wrapper.layer_wrappers.items():
                    layer_wrapper.is_record = True

    @contextlib.contextmanager
    def start_calib_iter(self, i):
        """Enable recording only for the block whose execution order is i, and quantize it on exit."""
        assert i < len(self.gptq_block_wrappers)
        target_module_wrapper = None
        try:
            for _, module_wrapper in self.gptq_block_wrappers.items():
                if module_wrapper.get_order() == i:
                    module_wrapper.enable()
                    target_module_wrapper = module_wrapper
                else:
                    module_wrapper.disable()
            yield
        finally:
            # guard against the case where no block was recorded with this order
            if target_module_wrapper is not None:
                target_module_wrapper.quant_module()

    def release_gpu_memory(self):
        for block_name, block_wrapper in self.gptq_block_wrappers.items():
            block_wrapper.release_gpu_memory()
        torch.cuda.empty_cache()
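

# Illustrative sketch (toy modules, not ChatGLM): the intended calibration flow of GPTQuantizer.
# `_demo_quantizer`, the nested Sequential blocks, and the random calibration batches are all
# hypothetical; gptq_quantize below is the real ChatGLM-specific driver.
def _demo_quantizer():
    model = nn.Sequential(
        nn.Sequential(nn.Linear(16, 16)),
        nn.Sequential(nn.Linear(16, 16)),
    )
    quantizer = GPTQuantizer(block_type=[nn.Sequential])
    quantizer.wrap_model(model, weight_bit_width=8)

    with quantizer.record_order():                # one pass to learn the block execution order
        model(torch.randn(2, 16))

    for i in range(quantizer.calibration_iters):  # calibrate and quantize one block per iteration
        with quantizer.start_calib_iter(i):
            for _ in range(4):
                model(torch.randn(2, 16))
    return model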


def locate_parent(root: nn.Module, full_path: str):
    parent = root
    path = full_path.split('.')
    for p in path[:-1]:
        parent = getattr(parent, p)
    return parent, path[-1]
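

# Illustrative sketch: locate_parent resolves a dotted layer name back to its parent module so
# the quantized replacement can be attached with add_module. The tiny module tree is hypothetical.
def _demo_locate_parent():
    root = nn.Sequential()
    root.add_module("block", nn.Sequential())
    root.block.add_module("fc", nn.Linear(4, 4))
    parent, name = locate_parent(root, "block.fc")
    assert parent is root.block and name == "fc"
    return parent, name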


@torch.no_grad()
def gptq_quantize(model, tokenizer, weight_bit_width, calib_data):
    from .modeling_chatglm import GLMBlock
    from .quantization import QuantizedLinear

    quantizer = GPTQuantizer([GLMBlock])
    calib_model = quantizer.wrap_model(model, weight_bit_width)

    with quantizer.record_order():
        calib_model.chat(tokenizer, calib_data[0], history=[])

    LOGGER.info("Start doing calibration using GPTQ")
    for i in range(quantizer.calibration_iters):
        LOGGER.info(f"Process: {i + 1}/{quantizer.calibration_iters}")
        # todo: should add early return to speed up the calibration
        with quantizer.start_calib_iter(i):
            for prompt in calib_data:
                model.chat(tokenizer, prompt, history=[])

    # replace the fp16 linear with quantized linear
    for _, block_wrapper in quantizer.gptq_block_wrappers.items():
        for layer_name, layer_wrapper in block_wrapper.layer_wrappers.items():
            layer = layer_wrapper.layer
            parent, name_in_parent = locate_parent(model, layer_name)
            quantized_layer = QuantizedLinear(
                weight_bit_width=weight_bit_width,
                weight_tensor=layer.weight,
                bias_tensor=layer.bias,
                weight_scale=layer_wrapper.weight_scale,
                in_features=layer.in_features,
                out_features=layer.out_features,
                bias=True,
                dtype=torch.half,
                device=layer_wrapper.device,
                empty_init=False
            )
            parent.add_module(name_in_parent, quantized_layer)

    torch.cuda.empty_cache()
    return
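

# Usage sketch (an assumption, not part of the original call path): how gptq_quantize would
# typically be driven from user code loaded with trust_remote_code. The checkpoint id, the
# calibration prompts, and the `_example_gptq_usage` name are illustrative only.
def _example_gptq_usage():
    from transformers import AutoModel, AutoTokenizer  # local import keeps module import light

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
    calib_data = [
        "Hello, please introduce yourself.",
        "Briefly explain what weight quantization does.",
    ]
    gptq_quantize(model, tokenizer, weight_bit_width=8, calib_data=calib_data)
    return model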