BigMaoGoGoGo committed
Commit 8fe54fa
1 parent: eb55ff0

add gptq quantization

Files changed (3)
  1. gptq_quantization.py +332 -0
  2. modeling_chatglm.py +18 -5
  3. quantization.py +17 -3
gptq_quantization.py ADDED
@@ -0,0 +1,332 @@
+
+import contextlib
+import logging
+import math
+from typing import List, Optional
+
+import torch
+import transformers
+from torch import nn
+
+LOGGER = logging.getLogger(__name__)
+
+QUANT_LAYERS = [nn.Linear, nn.Conv2d, transformers.Conv1D]
+
+def is_transformer_conv1d(layer):
+    return isinstance(layer, transformers.Conv1D)
+
+
+# These two functions only work for per-channel symmetric quantization of weights
+def get_weight_scale(weight, weight_bit_width):
+    weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+    return weight_scale
+
+def fake_quantize_weight(weight, weight_scale):
+    weight_scale = weight_scale[:, None]
+    fake_quantized_weight = torch.round(weight / weight_scale) * weight_scale
+    return fake_quantized_weight
+
+
+class GPTQLayerWrapper:
+    def __init__(self, layer_name, layer, weight_bit_width):
+        super().__init__()
+        self.layer_name = layer_name
+        self.layer = layer
+        self.device = layer.weight.device
+        columns = layer.weight.shape[1]
+        self.columns = columns
+        self.H = torch.zeros((columns, columns), device=self.device)
+        self.nsamples = 0
+        self.is_record = True
+        self.weight_bit_width = weight_bit_width
+        self.weight_scale = None
+
+    def record_h(self, x):
+        if self.is_record:
+            x = x.detach().clone()
+            if len(x.shape) == 2:
+                x = x.unsqueeze(0)
+            batch = x.shape[0]
+            if isinstance(self.layer, nn.Linear) or is_transformer_conv1d(self.layer):
+                if len(x.shape) == 3:
+                    x = x.reshape((-1, x.shape[-1]))
+                x = x.t()
+
+            if isinstance(self.layer, nn.Conv2d):
+                unfold = nn.Unfold(
+                    self.layer.kernel_size,
+                    dilation=self.layer.dilation,
+                    padding=self.layer.padding,
+                    stride=self.layer.stride
+                )
+                x = unfold(x)
+                x = x.permute([1, 0, 2])
+                x = x.flatten(1)
+
+            self.H *= self.nsamples / (self.nsamples + batch)
+            self.nsamples += batch
+            x = math.sqrt(2 / self.nsamples) * x.float()
+            self.H += x.matmul(x.t())
+
+    def quant_weight(self, blocksize=128, percdamp=.01, groupsize=-1):
+        if groupsize != -1:
+            raise RuntimeError("Group quantization of gptq quantizer is not supported for now")
+        weight = self.layer.weight.data.clone()
+        if isinstance(self.layer, nn.Conv2d):
+            weight = weight.flatten(1)
+        if is_transformer_conv1d(self.layer):
+            weight = weight.t()
+        weight = weight.float()
+
+        weight_scale = get_weight_scale(weight, self.weight_bit_width)
+        # todo: use buffer to store scale
+        self.weight_scale = weight_scale
+        H = self.H
+        dead = torch.diag(H) == 0
+        H[dead, dead] = 1
+        weight[:, dead] = 0
+
+        losses = torch.zeros_like(weight)
+        Q = torch.zeros_like(weight)
+
+        damp = percdamp * torch.mean(torch.diag(H))
+        diag = torch.arange(self.columns, device=self.device)
+        H[diag, diag] += damp
+        try:
+            H = torch.linalg.cholesky(H)
+            H = torch.cholesky_inverse(H)
+            H = torch.linalg.cholesky(H, upper=True)
+        except Exception:
+            logging.warning(f"Warning: cannot do compression on layer {self.layer_name} because of inverse error")
+            return
+
+        if H.isnan().any():
+            logging.warning(f"Warning: cannot do compression on layer {self.layer_name} because of inverse error")
+            return
+
+        hinv = H
+
+        for i1 in range(0, self.columns, blocksize):
+            i2 = min(i1 + blocksize, self.columns)
+            count = i2 - i1
+
+            w1 = weight[:, i1:i2].clone()
+            q1 = torch.zeros_like(w1)
+            total_err = torch.zeros_like(w1)
+            losses1 = torch.zeros_like(w1)
+            hinv1 = hinv[i1:i2, i1:i2]
+
+            for i in range(count):
+                w = w1[:, i]
+                d = hinv1[i, i]
+
+                q = fake_quantize_weight(w.unsqueeze(1), weight_scale).flatten()
+
+                q1[:, i] = q
+                losses1[:, i] = (w - q) ** 2 / d ** 2
+                err = (w - q) / d
+                w1[:, i:] -= err.unsqueeze(1).matmul(hinv1[i, i:].unsqueeze(0))
+                total_err[:, i] = err
+
+            Q[:, i1:i2] = q1
+            losses[:, i1:i2] = losses1 / 2
+
+            weight[:, i2:] -= total_err.matmul(hinv[i1:i2, i2:])
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        if is_transformer_conv1d(self.layer):
+            Q = Q.t()
+        self.layer.weight = nn.Parameter(Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype), requires_grad=False)
+
+        del self.H
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    def release_gpu_memory(self):
+        if hasattr(self, "H"):
+            del self.H
+
+
+class GPTQBlockWrapper:
+    def __init__(self, module_name: str, module: nn.Module, weight_bit_width=8):
+        self.layer_wrappers = {}
+        self.hook_handles = []
+        # module order in the whole network
+        self.order = 0
+        self.module_name = module_name
+
+        def get_hook(layer_name):
+            def record_hook(_, x):
+                self.layer_wrappers[layer_name].record_h(x[0])
+            return record_hook
+
+        for layer_name, layer in module.named_modules():
+            if isinstance(layer, tuple(QUANT_LAYERS)):
+                full_layer_name = f"{module_name}.{layer_name}" if layer_name else f"{module_name}"
+                self.layer_wrappers[full_layer_name] = GPTQLayerWrapper(full_layer_name, layer, weight_bit_width)
+                handle = layer.register_forward_pre_hook(get_hook(full_layer_name))
+                self.hook_handles.append(handle)
+
+    def quant_module(self):
+        for _, wrapper in self.layer_wrappers.items():
+            wrapper.quant_weight()
+
+        for h in self.hook_handles:
+            h.remove()
+
+    def set_order(self, idx):
+        self.order = idx
+
+    def get_order(self):
+        return self.order
+
+    def enable(self):
+        for n, l in self.layer_wrappers.items():
+            l.is_record = True
+
+    def disable(self):
+        for n, l in self.layer_wrappers.items():
+            l.is_record = False
+
+    def release_gpu_memory(self):
+        for _, wrapper in self.layer_wrappers.items():
+            wrapper.release_gpu_memory()
+
+
+class GPTQuantizer:
+    def __init__(self, block_type: Optional[List[type]] = None):
+        self.gptq_block_wrappers = {}
+        self.block_type = block_type
+
+    def wrap_model(self, model: nn.Module, weight_bit_width=8):
+
+        def wrap_block(m, prefix=""):
+            for name, child in m.named_children():
+                child_prefix = f"{prefix}.{name}" if prefix else name
+                if isinstance(child, tuple(self.block_type)):
+                    self.gptq_block_wrappers[name] = GPTQBlockWrapper(child_prefix, child, weight_bit_width)
+                    LOGGER.debug(f"Calibrate module {child_prefix} as a whole block in GPTQ")
+                else:
+                    wrap_block(child, child_prefix)
+
+        wrap_block(model)
+        return model
+
+    def quantize(self, model: nn.Module):
+        for _, module_wrapper in self.gptq_block_wrappers.items():
+            module_wrapper.quant_module()
+
+        return model
+
+    @property
+    def calibration_iters(self):
+        return len(self.gptq_block_wrappers)
+
+    @contextlib.contextmanager
+    def record_order(self):
+        counter = 0
+        record_handles = []
+        orders = {}
+        try:
+            def get_record_order_hook(module_name):
+                def record_hook(*args, **kwargs):
+                    nonlocal counter
+                    if module_name not in orders:
+                        orders[module_name] = counter
+                        counter += 1
+                return record_hook
+
+            for module_name, module_wrapper in self.gptq_block_wrappers.items():
+                # disable the record
+                for _, layer_wrapper in module_wrapper.layer_wrappers.items():
+                    layer_wrapper.is_record = False
+
+                one_layer_wrapper_in_module = list(module_wrapper.layer_wrappers.values())[0]
+                handles = one_layer_wrapper_in_module.layer.register_forward_pre_hook(get_record_order_hook(module_name))
+                record_handles.append(handles)
+            yield
+        except Exception as e:
+            logging.warning(e)
+        finally:
+            for module_name, order in orders.items():
+                self.gptq_block_wrappers[module_name].set_order(order)
+
+            for h in record_handles:
+                h.remove()
+
+            for module_name, module_wrapper in self.gptq_block_wrappers.items():
+                # re-enable the record
+                for _, layer_wrapper in module_wrapper.layer_wrappers.items():
+                    layer_wrapper.is_record = True
+
+
+    @contextlib.contextmanager
+    def start_calib_iter(self, i):
+        assert i < len(self.gptq_block_wrappers)
+        target_module_wrapper = None
+        try:
+            for _, module_wrapper in self.gptq_block_wrappers.items():
+                if module_wrapper.get_order() == i:
+                    module_wrapper.enable()
+                    target_module_wrapper = module_wrapper
+                else:
+                    module_wrapper.disable()
+            yield
+        finally:
+            target_module_wrapper.quant_module()
+
+    def release_gpu_memory(self):
+        for block_name, block_wrapper in self.gptq_block_wrappers.items():
+            block_wrapper.release_gpu_memory()
+
+        torch.cuda.empty_cache()
+
+
+def locate_parent(root: nn.Module, full_path: str):
+    parent = root
+    path = full_path.split('.')
+    for p in path[:-1]:
+        parent = getattr(parent, p)
+    return parent, path[-1]
+
+
+@torch.no_grad()
+def gptq_quantize(model, tokenizer, weight_bit_width, calib_data):
+    from .modeling_chatglm import GLMBlock
+    from .quantization import QuantizedLinear
+
+    quantizer = GPTQuantizer([GLMBlock])
+    calib_model = quantizer.wrap_model(model, weight_bit_width)
+    with quantizer.record_order():
+        calib_model.chat(tokenizer, calib_data[0], history=[])
+    logging.info("Start doing calibration using GPTQ")
+    for i in range(quantizer.calibration_iters):
+        logging.info(f"Process: {i + 1}/{quantizer.calibration_iters}")
+        # todo: should add early return to speed up the calibration
+        with quantizer.start_calib_iter(i):
+            for prompt in calib_data:
+                model.chat(tokenizer, prompt, history=[])
+
+    # replace the fp16 linear with quantized linear
+    for _, block_wrapper in quantizer.gptq_block_wrappers.items():
+        for layer_name, layer_wrapper in block_wrapper.layer_wrappers.items():
+            layer = layer_wrapper.layer
+            parent, name_in_parent = locate_parent(model, layer_name)
+            quantized_layer = QuantizedLinear(
+                weight_bit_width=weight_bit_width,
+                weight_tensor=layer.weight,
+                bias_tensor=layer.bias,
+                weight_scale=layer_wrapper.weight_scale,
+                in_features=layer.in_features,
+                out_features=layer.out_features,
+                bias=True,
+                dtype=torch.half,
+                device=layer_wrapper.device,
+                empty_init=False
+            )
+            parent.add_module(name_in_parent, quantized_layer)
+
+    torch.cuda.empty_cache()
+    return
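For reference, this is the per-output-channel symmetric rounding that get_weight_scale and fake_quantize_weight implement; GPTQLayerWrapper.quant_weight applies the same rounding column by column while folding each column's rounding error back into the not-yet-quantized columns through the inverse Hessian. A minimal standalone sketch (the toy tensor shape and the 8-bit width are illustrative, not part of the diff):

import torch

# toy [out_features, in_features] weight, laid out the way nn.Linear stores it
w = torch.randn(4, 8)
bit = 8

# per-output-channel symmetric scale, mirroring get_weight_scale()
scale = (w.abs().max(dim=-1).values / (2 ** (bit - 1) - 1)).half()

# round onto the signed integer grid and scale back, mirroring fake_quantize_weight()
w_fq = torch.round(w / scale[:, None]) * scale[:, None]

# the per-element reconstruction error stays within roughly half a quantization step
print((w - w_fq).abs().max())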
modeling_chatglm.py CHANGED
@@ -1408,12 +1408,14 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 break
             yield input_ids
 
-    def quantize(self, bits: int, empty_init=False, **kwargs):
+    def quantize(
+            self, bits: int, empty_init=False, quant_algo_type: str="min_max",
+            calib_data: Optional[List[str]]=None, tokenizer=None, **kwargs):
         if bits == 0:
             return
-        from .quantization import quantize
-
+        from .quantization import quantize, QuantAlgoType
+        from .gptq_quantization import gptq_quantize
         if self.quantized:
             logger.info("Already quantized.")
             return self
@@ -1421,6 +1423,17 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         self.quantized = True
 
         self.config.quantization_bit = bits
-
-        self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
+        quant_algo_type = QuantAlgoType(quant_algo_type)
+        if quant_algo_type == QuantAlgoType.min_max:
+            self.transformer = quantize(
+                self.transformer, bits, empty_init=empty_init, algo_type=quant_algo_type, calib_data=calib_data, tokenizer=tokenizer, **kwargs)
+        elif quant_algo_type == QuantAlgoType.gptq:
+            if calib_data is None or tokenizer is None:
+                raise RuntimeError("If using gptq to quantize the model, "
+                                   "calibration data (e.g. some string prompts) and tokenizer should be provided")
+            gptq_quantize(
+                self, tokenizer, bits, calib_data
+            )
+        else:
+            raise RuntimeError("Unsupported quantization algorithm type")
         return self
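With the extended signature above, the quantization algorithm is chosen per call. A hypothetical invocation (the model id, prompts, and bit width are placeholders; only the quantize arguments come from this diff):

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()

# a few short prompts drive the model.chat() forward passes used for GPTQ calibration
calib_data = ["你好", "Explain the GPTQ algorithm in one sentence."]

# quant_algo_type defaults to "min_max"; "gptq" additionally requires calib_data and a tokenizer
model = model.quantize(8, quant_algo_type="gptq", calib_data=calib_data, tokenizer=tokenizer)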
quantization.py CHANGED
@@ -8,7 +8,7 @@ import ctypes
 from transformers.utils import logging
 
 from typing import List
-from functools import partial
+from enum import Enum
 
 logger = logging.get_logger(__name__)
 
@@ -41,6 +41,17 @@ except Exception as exception:
     logger.warning("Failed to load cpm_kernels:" + str(exception))
 
 
+class QuantAlgoType(Enum):
+    min_max = 'min_max'
+    gptq = 'gptq'
+
+    @classmethod
+    def _missing_(cls, value):
+        supported_types = [e.value for e in cls]
+        raise ValueError(f"Unsupported quantization algorithm type. Support list: "
+                         f"{supported_types}. Got: '{value}'")
+
+
 class W8A16Linear(torch.autograd.Function):
     @staticmethod
     def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
@@ -118,7 +129,7 @@ def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, sourc
 
 
 class QuantizedLinear(Linear):
-    def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, empty_init=False, *args, **kwargs):
+    def __init__(self, weight_bit_width: int, weight_tensor=None, bias_tensor=None, weight_scale=None, empty_init=False, *args, **kwargs):
         super(QuantizedLinear, self).__init__(*args, **kwargs)
         self.weight_bit_width = weight_bit_width
 
@@ -131,7 +142,10 @@ class QuantizedLinear(Linear):
             )
             self.weight_scale = torch.empty(shape[0], dtype=kwargs["dtype"], device=kwargs["device"])
         else:
-            self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+            if weight_scale is None:
+                self.weight_scale = (weight_tensor.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half()
+            else:
+                self.weight_scale = weight_scale
             self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
             if weight_bit_width == 4:
                 self.weight = compress_int4_weight(self.weight)
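A quick illustration of the new enum's validation, assuming quantization.py is importable from the repository directory (the "awq" string is just an arbitrary unsupported value):

from quantization import QuantAlgoType

QuantAlgoType("gptq")     # -> QuantAlgoType.gptq
QuantAlgoType("min_max")  # -> QuantAlgoType.min_max
QuantAlgoType("awq")      # raises ValueError via _missing_, listing ['min_max', 'gptq']

This is what lets ChatGLMForConditionalGeneration.quantize turn the quant_algo_type string into a validated enum before dispatching to either the min_max or the GPTQ path.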