Update export_qwen2_wc.py
Browse files- export_qwen2_wc.py +13 -87
export_qwen2_wc.py
CHANGED
@@ -207,7 +207,7 @@ def generate() -> None:
|
|
207 |
if is_first_run:
|
208 |
model_inp = input_ids
|
209 |
is_first_run = False
|
210 |
-
print(f"KEVINDEBUG model_inp: {model_inp} causal_mask: {causal_mask}")
|
211 |
output = torch_model(model_inp, causal_mask) # First run passes the full prompt sub-sequence so multiple previous tokens are processed at once; later runs feed a single token against the same past (KV) state, so keep the input right-padded.
|
212 |
|
213 |
# Get the most likely token IDs
|
@@ -215,7 +215,7 @@ def generate() -> None:
|
|
215 |
|
216 |
# Append the generated token IDs to the output tensor
|
217 |
output_tokens = torch.cat((output_tokens, output_ids[:, -1, None]), dim=-1)
|
218 |
-
print(f"KEVINDEBUG output_tokens: {output_tokens}")
|
219 |
|
220 |
# Update the causal mask
|
221 |
causal_mask = torch.ones((1, 1, 1, output_tokens.shape[-1] + 1), dtype=torch.float32)
|
@@ -269,91 +269,17 @@ def export() -> None:
|
|
269 |
mlmodel_fp16.save("StatefulQwen2_0_5_BInstructFP16.mlpackage")
|
270 |
|
271 |
# Block-wise quantize model weights to int4
|
272 |
-
op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
)
|
278 |
-
config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
|
279 |
-
mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
|
280 |
-
mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
281 |
-
mlmodel_int4.save("StatefulQwen2_0_5_BInstructInt4.mlpackage")
|
282 |
|
283 |
|
284 |
if __name__ == "__main__":
|
285 |
-
|
286 |
-
|
287 |
-
###
|
288 |
-
#KEVINDEBUG model_inp: tensor([[ 8420, 374, 264, 32794, 911, 60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1.]]]])
|
289 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11]])
|
290 |
-
#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
291 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689]])
|
292 |
-
#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
293 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13]])
|
294 |
-
#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
295 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084]])
|
296 |
-
#KEVINDEBUG model_inp: tensor([[1084]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
297 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
298 |
-
# 374]])
|
299 |
-
#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
300 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
301 |
-
# 374, 264]])
|
302 |
-
#KEVINDEBUG model_inp: tensor([[264]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
303 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
304 |
-
# 374, 264, 32794]])
|
305 |
-
#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
306 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
307 |
-
# 374, 264, 32794, 911]])
|
308 |
-
#KEVINDEBUG model_inp: tensor([[911]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
309 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
310 |
-
# 374, 264, 32794, 911, 279]])
|
311 |
-
#KEVINDEBUG model_inp: tensor([[279]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
312 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
313 |
-
# 374, 264, 32794, 911, 279, 3283]])
|
314 |
-
#KEVINDEBUG model_inp: tensor([[3283]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
315 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
316 |
-
# 374, 264, 32794, 911, 279, 3283, 315]])
|
317 |
-
#KEVINDEBUG model_inp: tensor([[315]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
318 |
-
# 1.]]]])
|
319 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
320 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249]])
|
321 |
-
#KEVINDEBUG model_inp: tensor([[60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
322 |
-
# 1., 1.]]]])
|
323 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
324 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11]])
|
325 |
-
#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
326 |
-
# 1., 1., 1.]]]])
|
327 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
328 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689]])
|
329 |
-
#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
330 |
-
# 1., 1., 1., 1.]]]])
|
331 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
332 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
333 |
-
# 13]])
|
334 |
-
#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
335 |
-
# 1., 1., 1., 1., 1.]]]])
|
336 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
337 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
338 |
-
# 13, 576]])
|
339 |
-
#KEVINDEBUG model_inp: tensor([[576]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
340 |
-
# 1., 1., 1., 1., 1., 1.]]]])
|
341 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
342 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
343 |
-
# 13, 576, 32794]])
|
344 |
-
#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
345 |
-
# 1., 1., 1., 1., 1., 1., 1.]]]])
|
346 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
347 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
348 |
-
# 13, 576, 32794, 374]])
|
349 |
-
#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
350 |
-
# 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
351 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
352 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
353 |
-
# 13, 576, 32794, 374, 5326]])
|
354 |
-
#KEVINDEBUG model_inp: tensor([[5326]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
355 |
-
# 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
356 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
357 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
358 |
-
# 13, 576, 32794, 374, 5326, 304]])
|
359 |
-
#input : Here is a poem about Valencia output: Here is a poem about Valencia, Spain. It is a poem about the city of Valencia, Spain. The poem is written in
|
|
|
207 |
if is_first_run:
|
208 |
model_inp = input_ids
|
209 |
is_first_run = False
|
210 |
+
#print(f"KEVINDEBUG model_inp: {model_inp} causal_mask: {causal_mask}")
|
211 |
output = torch_model(model_inp, causal_mask) # First run passes the full prompt sub-sequence so multiple previous tokens are processed at once; later runs feed a single token against the same past (KV) state, so keep the input right-padded.
|
212 |
|
213 |
# Get the most likely token IDs
|
|
|
215 |
|
216 |
# Append the generated token IDs to the output tensor
|
217 |
output_tokens = torch.cat((output_tokens, output_ids[:, -1, None]), dim=-1)
|
218 |
+
#print(f"KEVINDEBUG output_tokens: {output_tokens}")
|
219 |
|
220 |
# Update the causal mask
|
221 |
causal_mask = torch.ones((1, 1, 1, output_tokens.shape[-1] + 1), dtype=torch.float32)
|
|
|
269 |
mlmodel_fp16.save("StatefulQwen2_0_5_BInstructFP16.mlpackage")
|
270 |
|
271 |
# Block-wise quantize model weights to int4
|
272 |
+
# op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
|
273 |
+
# mode="linear_symmetric",
|
274 |
+
# dtype="int4",
|
275 |
+
# granularity="per_block",
|
276 |
+
# block_size=32,
|
277 |
+
# )
|
278 |
+
# config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
|
279 |
+
# mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
|
280 |
+
# mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
281 |
+
# mlmodel_int4.save("StatefulQwen2_0_5_BInstructInt4.mlpackage")
|
282 |
|
283 |
|
284 |
if __name__ == "__main__":
|
285 |
+
export()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|