BigMaoGoGoGo committed · Commit ac792e3 · 1 Parent(s): 8fe54fa

fix gpu cache
gptq_quantization.py CHANGED (+39 -47)

@@ -138,38 +138,34 @@ class GPTQLayerWrapper:
 
         if is_transformer_conv1d(self.layer):
             Q = Q.t()
-        …
-        …
+        shape = self.layer.weight.shape
+        dtype = self.layer.weight.data.dtype
+        del self.layer.weight
+        setattr(self.layer, "weight", nn.Parameter(Q.reshape(shape).to(dtype), requires_grad=False))
         del self.H
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
-    def release_gpu_memory(self):
-        if hasattr(self, "H"):
-            del self.H
 
 
 class GPTQBlockWrapper:
-    def __init__(self, …
+    def __init__(self, block_name: str, block: nn.Module, weight_bit_width=8):
         self.layer_wrappers = {}
         self.hook_handles = []
-        # …
+        # block order in the whole network
         self.order = 0
-        self.…
+        self.block_name = block_name
 
         def get_hook(layer_name):
             def record_hook(_, x):
                 self.layer_wrappers[layer_name].record_h(x[0])
             return record_hook
 
-        for layer_name, layer in …
+        for layer_name, layer in block.named_modules():
             if isinstance(layer, tuple(QUANT_LAYERS)):
-                full_layer_name = f"{…
+                full_layer_name = f"{block_name}.{layer_name}" if layer_name else f"{block_name}"
                 self.layer_wrappers[full_layer_name] = GPTQLayerWrapper(full_layer_name, layer, weight_bit_width)
                 handle = layer.register_forward_pre_hook(get_hook(full_layer_name))
                 self.hook_handles.append(handle)
 
-    def …
+    def quant_block(self):
         for _, wrapper in self.layer_wrappers.items():
             wrapper.quant_weight()
 
@@ -190,10 +186,6 @@ class GPTQBlockWrapper:
         for n, l in self.layer_wrappers.items():
             l.is_record = False
 
-    def release_gpu_memory(self):
-        for _, wrapper in self.layer_wrappers.items():
-            wrapper.release_gpu_memory()
-
 
 class GPTQuantizer:
     def __init__(self, block_type: Optional[List[type]] = None):

@@ -207,19 +199,13 @@ class GPTQuantizer:
                 child_prefix = f"{prefix}.{name}" if prefix else name
                 if isinstance(child, tuple(self.block_type)):
                     self.gptq_block_wrappers[name] = GPTQBlockWrapper(child_prefix, child, weight_bit_width)
-                    LOGGER.debug(f"Calibrate …
+                    LOGGER.debug(f"Calibrate block {child_prefix} as a whole block in GPTQ")
                 else:
                     wrap_block(child, child_prefix)
 
         wrap_block(model)
         return model
 
-    def quantize(self, model: nn.Module):
-        for _, module_wrapper in self.gptq_block_wrappers.items():
-            module_wrapper.quant_module()
-
-        return model
-
     @property
     def calibration_iters(self):
         return len(self.gptq_block_wrappers)

@@ -230,56 +216,59 @@ class GPTQuantizer:
         record_handles = []
         orders = {}
         try:
-            def get_record_order_hook(…
+            def get_record_order_hook(block_name):
                 def record_hook(*args, **kwargs):
                     nonlocal counter
-                    if …
-                        orders[…
+                    if block_name not in orders:
+                        orders[block_name] = counter
                     counter += 1
                 return record_hook
 
-            for …
+            for block_name, block_wrapper in self.gptq_block_wrappers.items():
                 # disable the record
-                for _, layer_wrapper in …
+                for _, layer_wrapper in block_wrapper.layer_wrappers.items():
                     layer_wrapper.is_record = False
 
-                …
-                handles = …
+                one_layer_wrapper_in_block = list(block_wrapper.layer_wrappers.values())[0]
+                handles = one_layer_wrapper_in_block.layer.register_forward_pre_hook(get_record_order_hook(block_name))
                 record_handles.append(handles)
             yield
         except Exception as e:
             logging.warning(e)
         finally:
-            for …
-                self.gptq_block_wrappers[…
+            for block_name, order in orders.items():
+                self.gptq_block_wrappers[block_name].set_order(order)
 
             for h in record_handles:
                 h.remove()
 
-            for …
+            for _, block_wrapper in self.gptq_block_wrappers.items():
                 # disable the record
-                for _, layer_wrapper in …
+                for _, layer_wrapper in block_wrapper.layer_wrappers.items():
                     layer_wrapper.is_record = True
 
 
     @contextlib.contextmanager
     def start_calib_iter(self, i):
         assert i < len(self.gptq_block_wrappers)
-        …
+        target_block_wrapper = None
         try:
-            for _, …
-                if …
-                    …
-                    …
+            for _, block_wrapper in self.gptq_block_wrappers.items():
+                if block_wrapper.get_order() == i:
+                    block_wrapper.enable()
+                    target_block_wrapper = block_wrapper
                 else:
-                    …
+                    block_wrapper.disable()
             yield
         finally:
-            …
+            target_block_wrapper.quant_block()
 
-    def …
-        …
-        …
+    def release_reference(self):
+        # delete reference so that `torch.cuda.empty_cache()` can
+        # release all the gpu memory cache used during calibration
+        for _, block_wrapper in self.gptq_block_wrappers.items():
+            for _, layer_wrapper in block_wrapper.layer_wrappers.items():
+                del layer_wrapper.layer
 
         torch.cuda.empty_cache()
 

@@ -301,10 +290,12 @@ def gptq_quantize(model, tokenizer, weight_bit_width, calib_data):
     calib_model = quantizer.wrap_model(model, weight_bit_width)
     with quantizer.record_order():
         calib_model.chat(tokenizer, calib_data[0], history=[])
+
     logging.info("Start doing calibration using GPTQ ")
     for i in range(quantizer.calibration_iters):
         logging.info(f"Process: {i + 1}/{quantizer.calibration_iters}")
         # todo: should add early return to speed up the calibration
+        # todo: add cpu offload to reduce the gpu memory requirements.
         with quantizer.start_calib_iter(i):
             for prompt in calib_data:
                 model.chat(tokenizer, prompt, history=[])

@@ -328,5 +319,6 @@ def gptq_quantize(model, tokenizer, weight_bit_width, calib_data):
             )
             parent.add_module(name_in_parent, quantized_layer)
 
-
+    # release the memory cache during calibration
+    quantizer.release_reference()
     return
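The fix relies on a general PyTorch behavior: `torch.cuda.empty_cache()` can only return allocator blocks whose tensors are no longer referenced from Python, which is why the new `release_reference()` drops `layer_wrapper.layer` before the final `empty_cache()` call. A minimal standalone sketch of that pattern follows; it is illustrative only and not part of this repository (the tensor name `big_activation` and the `report` helper are made up for the example), and it uses only standard PyTorch calls.

import torch

def report(tag):
    # memory_allocated: bytes held by live tensors; memory_reserved: bytes cached by the allocator
    print(f"{tag}: allocated={torch.cuda.memory_allocated() / 2**20:.1f} MiB, "
          f"reserved={torch.cuda.memory_reserved() / 2**20:.1f} MiB")

if torch.cuda.is_available():
    # ~1 GiB of fp32 activations, standing in for calibration state kept on the GPU
    big_activation = torch.randn(1024, 1024, 256, device="cuda")
    report("after allocation")

    # empty_cache() alone does nothing for blocks that are still referenced
    torch.cuda.empty_cache()
    report("empty_cache with live reference")

    # drop the last Python reference first, then empty_cache() can return the block
    del big_activation
    torch.cuda.empty_cache()
    report("after del + empty_cache")

With the reference still alive, the reserved figure stays at roughly the allocation size; only after `del` does `empty_cache()` shrink it, which mirrors the ordering used by `release_reference()` in the diff above.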