kevin36524
/

qwen2_0_5_instruct_int4_coreml

Core ML

Model card · Files and versions · Community

kevin36524 committed on Aug 13, 2024

Commit

ae5bac8

verified ·

1 Parent(s): 8841dcd

Upload export_llama.py with huggingface_hub

Browse files

Files changed (1) hide show

export_llama.py +14 -81

export_llama.py CHANGED Viewed

@@ -226,9 +226,15 @@ def export() -> None:
     max_context_size: int = 2048
     torch_model = StatefulLlamaForCausalLM(MODEL_ID, max_context_size=max_context_size)
     torch_model.eval()
-    input_ids: torch.Tensor = torch.tensor([[19161,   253,  8216,   335, 10910,   216]], dtype=torch.int32)
-    #input_ids: torch.Tensor = torch.tensor([[ 11 ]], dtype=torch.int32)
-    causal_mask: torch.Tensor = torch.ones((1, 1, 1, 7), dtype=torch.float32)
     traced_model = torch.jit.trace(torch_model, [input_ids, causal_mask])
     # Convert traced TorchScript to Core ML format
@@ -263,8 +269,8 @@ def export() -> None:
         minimum_deployment_target=ct.target.iOS18,
         skip_model_load=True,
     )
-    mlmodel_fp16._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
-    mlmodel_fp16.save("StatefulSmolLM_360M_InstructFP16.mlpackage")
     # Block-wise quantize model weights to int4
     op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
@@ -276,82 +282,9 @@ def export() -> None:
     config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
     mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
     mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
-    mlmodel_int4.save("StatefulSmolLM_360M_InstructInt4.mlpackage")
 if __name__ == "__main__":
-    generate()
-###
-#KEVINDEBUG model_inp: tensor([[ 8420,   374,   264, 32794,   911, 60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11]])
-#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689]])
-#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13]])
-#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084]])
-#KEVINDEBUG model_inp: tensor([[1084]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374]])
-#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264]])
-#KEVINDEBUG model_inp: tensor([[264]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794]])
-#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911]])
-#KEVINDEBUG model_inp: tensor([[911]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279]])
-#KEVINDEBUG model_inp: tensor([[279]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283]])
-#KEVINDEBUG model_inp: tensor([[3283]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315]])
-#KEVINDEBUG model_inp: tensor([[315]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249]])
-#KEVINDEBUG model_inp: tensor([[60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11]])
-#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11, 17689]])
-#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11, 17689,
-#            13]])
-#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11, 17689,
-#            13,   576]])
-#KEVINDEBUG model_inp: tensor([[576]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11, 17689,
-#            13,   576, 32794]])
-#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11, 17689,
-#            13,   576, 32794,   374]])
-#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11, 17689,
-#            13,   576, 32794,   374,  5326]])
-#KEVINDEBUG model_inp: tensor([[5326]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-#           1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
-#KEVINDEBUG output_tokens: tensor([[ 8420,   374,   264, 32794,   911, 60249,    11, 17689,    13,  1084,
-#           374,   264, 32794,   911,   279,  3283,   315, 60249,    11, 17689,
-#            13,   576, 32794,   374,  5326,   304]])
-#input : Here is a poem about Valencia output: Here is a poem about Valencia, Spain. It is a poem about the city of Valencia, Spain. The poem is written in

     max_context_size: int = 2048
     torch_model = StatefulLlamaForCausalLM(MODEL_ID, max_context_size=max_context_size)
     torch_model.eval()
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    #initial_prompt = "Write a christmas Carol"
+    initial_prompt = "Write a poem on Apple "
+    input_ids = tokenizer(initial_prompt, return_tensors='pt').input_ids
+    causal_mask: torch.Tensor = torch.ones((1, 1, 1, input_ids.shape[-1] + 1), dtype=torch.float32)
     traced_model = torch.jit.trace(torch_model, [input_ids, causal_mask])
     # Convert traced TorchScript to Core ML format
         minimum_deployment_target=ct.target.iOS18,
         skip_model_load=True,
     )
+    #mlmodel_fp16._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
+    #mlmodel_fp16.save("Stateful_llama_3_1_8B_InstructFP16.mlpackage")
     # Block-wise quantize model weights to int4
     op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
     config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
     mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
     mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
+    mlmodel_int4.save("Stateful_llama_3_1_8B_InstructInt4.mlpackage")
 if __name__ == "__main__":
+    export()