GiusFra committed on
Commit
049c65f
1 Parent(s): 742c3ad

Upload math_model.py with huggingface_hub

Files changed (1)
math_model.py +5 -1
math_model.py CHANGED
@@ -6,7 +6,7 @@ def quantize(tensor, scale, zero_point, is_asym=False):
         clamp_min, clamp_max = torch.tensor(0.), torch.tensor(255.)
     else:
         clamp_min, clamp_max = torch.tensor(-128.), torch.tensor(127.)
-    quant_tensor = torch.clamp(torch.round(tensor/scale), clamp_min, clamp_max) + zero_point
+    quant_tensor = torch.clamp(torch.round(tensor/scale + zero_point), clamp_min, clamp_max)
     return quant_tensor
 
 def dequantize(tensor, scale, zero_point):
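The change moves the zero point inside round() and clamp(), which is the standard asymmetric-quantization form: previously the zero point was added after clamping, so the stored value could fall outside the integer range. A minimal sketch of the difference (the scale and zero-point values below are made up for illustration):

import torch

scale, zp = torch.tensor(0.1), torch.tensor(200.)
x = torch.tensor([10.0])  # x / scale = 100, well inside the uint8 range

# Old formula: zero point added after clamping, so the result can leave [0, 255].
old = torch.clamp(torch.round(x / scale), 0., 255.) + zp   # tensor([300.])

# New formula: the zero point participates in the clamp, so the value saturates.
new = torch.clamp(torch.round(x / scale + zp), 0., 255.)   # tensor([255.])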
@@ -30,6 +30,8 @@ class QuantLinear(nn.Module):
 
     def forward(self, x):
         scaled_x = x * self.mul_factor
+        # With an integer matmul kernel, if the weight zero point is not zero,
+        # an extra input channel equal to the per-channel zero point of the weights is required
         quant_weight = quantize(self.linear.weight, self.weight_scale, self.weight_zp, is_asym=True)
         quant_input = quantize(scaled_x, self.input_scale, self.input_zp, is_asym=False)
         dequantized_weight = dequantize(quant_weight, self.weight_scale, self.weight_zp)
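The new comment refers to the zero-point cross term of an integer kernel: since (q_w - z_w) @ q_x = q_w @ q_x - z_w * sum(q_x), a kernel that accumulates raw integers needs the per-channel z_w * sum(q_x) correction, which an extra input channel can supply. A self-contained check of that identity, with hypothetical shapes not taken from this file (the scale factors cancel on both sides and are omitted):

import torch

torch.manual_seed(0)
q_w = torch.randint(0, 256, (4, 8)).float()    # asymmetric uint8 weights
z_w = torch.randint(0, 256, (4, 1)).float()    # per-output-channel weight zero point
q_x = torch.randint(-128, 128, (8,)).float()   # symmetric int8 input (zero point 0)

# Reference: dequantize the weights first, then matmul, as forward() does here.
ref = (q_w - z_w) @ q_x

# Integer-kernel view: accumulate raw q_w @ q_x, then subtract the zero-point
# cross term z_w * sum(q_x) -- the quantity the extra input channel supplies.
acc = q_w @ q_x - z_w.squeeze(1) * q_x.sum()
assert torch.allclose(ref, acc)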
@@ -54,6 +56,8 @@ class QuantConv2d(nn.Module):
 
     def forward(self, x):
         scaled_x = x * self.mul_factor
+        # With an integer conv kernel, if the weight zero point is not zero,
+        # an extra input channel equal to the per-channel zero point of the weights is required
         quant_weight = quantize(self.linear.weight, self.weight_scale, self.weight_zp, is_asym=True)
         quant_input = quantize(scaled_x, self.input_scale, self.input_zp, is_asym=False)
         dequantized_weight = dequantize(quant_weight, self.weight_scale, self.weight_zp)
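The same correction applies per receptive field in the conv case: the per-window input sum that the extra channel would provide can be emulated with an all-ones kernel. A sketch under the same hypothetical-shape assumptions:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
q_w = torch.randint(0, 256, (2, 3, 3, 3)).float()    # uint8 conv weights
z_w = torch.randint(0, 256, (2,)).float()            # per-output-channel zero point
q_x = torch.randint(-128, 128, (1, 3, 8, 8)).float() # symmetric int8 input

# Reference: dequantize the weights first, then convolve.
ref = F.conv2d(q_x, q_w - z_w.view(-1, 1, 1, 1))

# Integer-kernel view: convolve with raw q_w, then subtract z_w times the
# per-window input sum, computed here with an all-ones kernel.
win_sum = F.conv2d(q_x, torch.ones(1, 3, 3, 3))
acc = F.conv2d(q_x, q_w) - z_w.view(1, -1, 1, 1) * win_sum
assert torch.allclose(ref, acc)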
 