Update export_qwen2_wc.py
Browse files- export_qwen2_wc.py +13 -87
export_qwen2_wc.py
CHANGED
@@ -207,7 +207,7 @@ def generate() -> None:
|
|
207 |
if is_first_run:
|
208 |
model_inp = input_ids
|
209 |
is_first_run = False
|
210 |
-
print(f"KEVINDEBUG model_inp: {model_inp} causal_mask: {causal_mask}")
|
211 |
output = torch_model(model_inp, causal_mask) # First run passes the full prompt sub-sequence so multiple previous tokens are processed at once; later runs feed a single token against the same past (KV) state, so keep the input right-padded.
|
212 |
|
213 |
# Get the most likely token IDs
|
@@ -215,7 +215,7 @@ def generate() -> None:
|
|
215 |
|
216 |
# Append the generated token IDs to the output tensor
|
217 |
output_tokens = torch.cat((output_tokens, output_ids[:, -1, None]), dim=-1)
|
218 |
-
print(f"KEVINDEBUG output_tokens: {output_tokens}")
|
219 |
|
220 |
# Update the causal mask
|
221 |
causal_mask = torch.ones((1, 1, 1, output_tokens.shape[-1] + 1), dtype=torch.float32)
|
@@ -269,91 +269,17 @@ def export() -> None:
|
|
269 |
mlmodel_fp16.save("StatefulQwen2_0_5_BInstructFP16.mlpackage")
|
270 |
|
271 |
# Block-wise quantize model weights to int4
|
272 |
-
op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
)
|
278 |
-
config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
|
279 |
-
mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
|
280 |
-
mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
281 |
-
mlmodel_int4.save("StatefulQwen2_0_5_BInstructInt4.mlpackage")
|
282 |
|
283 |
|
284 |
if __name__ == "__main__":
|
285 |
-
|
286 |
-
|
287 |
-
###
|
288 |
-
#KEVINDEBUG model_inp: tensor([[ 8420, 374, 264, 32794, 911, 60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1.]]]])
|
289 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11]])
|
290 |
-
#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
291 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689]])
|
292 |
-
#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
293 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13]])
|
294 |
-
#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
295 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084]])
|
296 |
-
#KEVINDEBUG model_inp: tensor([[1084]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
297 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
298 |
-
# 374]])
|
299 |
-
#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
300 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
301 |
-
# 374, 264]])
|
302 |
-
#KEVINDEBUG model_inp: tensor([[264]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
303 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
304 |
-
# 374, 264, 32794]])
|
305 |
-
#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
306 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
307 |
-
# 374, 264, 32794, 911]])
|
308 |
-
#KEVINDEBUG model_inp: tensor([[911]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
309 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
310 |
-
# 374, 264, 32794, 911, 279]])
|
311 |
-
#KEVINDEBUG model_inp: tensor([[279]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
312 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
313 |
-
# 374, 264, 32794, 911, 279, 3283]])
|
314 |
-
#KEVINDEBUG model_inp: tensor([[3283]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
315 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
316 |
-
# 374, 264, 32794, 911, 279, 3283, 315]])
|
317 |
-
#KEVINDEBUG model_inp: tensor([[315]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
318 |
-
# 1.]]]])
|
319 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
320 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249]])
|
321 |
-
#KEVINDEBUG model_inp: tensor([[60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
322 |
-
# 1., 1.]]]])
|
323 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
324 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11]])
|
325 |
-
#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
326 |
-
# 1., 1., 1.]]]])
|
327 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
328 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689]])
|
329 |
-
#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
330 |
-
# 1., 1., 1., 1.]]]])
|
331 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
332 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
333 |
-
# 13]])
|
334 |
-
#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
335 |
-
# 1., 1., 1., 1., 1.]]]])
|
336 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
337 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
338 |
-
# 13, 576]])
|
339 |
-
#KEVINDEBUG model_inp: tensor([[576]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
340 |
-
# 1., 1., 1., 1., 1., 1.]]]])
|
341 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
342 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
343 |
-
# 13, 576, 32794]])
|
344 |
-
#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
345 |
-
# 1., 1., 1., 1., 1., 1., 1.]]]])
|
346 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
347 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
348 |
-
# 13, 576, 32794, 374]])
|
349 |
-
#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
350 |
-
# 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
351 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
352 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
353 |
-
# 13, 576, 32794, 374, 5326]])
|
354 |
-
#KEVINDEBUG model_inp: tensor([[5326]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
355 |
-
# 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
356 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
357 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
358 |
-
# 13, 576, 32794, 374, 5326, 304]])
|
359 |
-
#input : Here is a poem about Valencia output: Here is a poem about Valencia, Spain. It is a poem about the city of Valencia, Spain. The poem is written in
|
|
|
207 |
if is_first_run:
|
208 |
model_inp = input_ids
|
209 |
is_first_run = False
|
210 |
+
#print(f"KEVINDEBUG model_inp: {model_inp} causal_mask: {causal_mask}")
|
211 |
output = torch_model(model_inp, causal_mask) # First run passes the full prompt sub-sequence so multiple previous tokens are processed at once; later runs feed a single token against the same past (KV) state, so keep the input right-padded.
|
212 |
|
213 |
# Get the most likely token IDs
|
|
|
215 |
|
216 |
# Append the generated token IDs to the output tensor
|
217 |
output_tokens = torch.cat((output_tokens, output_ids[:, -1, None]), dim=-1)
|
218 |
+
#print(f"KEVINDEBUG output_tokens: {output_tokens}")
|
219 |
|
220 |
# Update the causal mask
|
221 |
causal_mask = torch.ones((1, 1, 1, output_tokens.shape[-1] + 1), dtype=torch.float32)
|
|
|
269 |
mlmodel_fp16.save("StatefulQwen2_0_5_BInstructFP16.mlpackage")
|
270 |
|
271 |
# Block-wise quantize model weights to int4
|
272 |
+
# op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
|
273 |
+
# mode="linear_symmetric",
|
274 |
+
# dtype="int4",
|
275 |
+
# granularity="per_block",
|
276 |
+
# block_size=32,
|
277 |
+
# )
|
278 |
+
# config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
|
279 |
+
# mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
|
280 |
+
# mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
281 |
+
# mlmodel_int4.save("StatefulQwen2_0_5_BInstructInt4.mlpackage")
|
282 |
|
283 |
|
284 |
if __name__ == "__main__":
|
285 |
+
export()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|