Upload export_qwen2_wc.py with huggingface_hub
export_qwen2_wc.py CHANGED (+131 -2)
@@ -120,7 +120,7 @@ class SliceUpdateQwen2Attention(Qwen2Attention):
         L, S = query_states.size(-2), key_states.size(-2)
         causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)

-        print(f"KEVINDEBUG query_states:{query_states.shape} key_states:{key_states.shape} value_states:{value_states.shape} causal_mask:{causal_mask}")
+        #print(f"KEVINDEBUG query_states:{query_states.shape} key_states:{key_states.shape} value_states:{value_states.shape} causal_mask:{causal_mask}")
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
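Note: the boolean mask built in the context lines above is a standard lower-triangular causal mask. A standalone illustration (plain PyTorch, not part of this commit) of the mask it constructs for a small query/key pair:

# Standalone sketch of the causal-mask construction shown above; uses only torch.
import torch

L, S = 3, 5  # example query length and key (cache) length
causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
print(causal_mask.shape)  # torch.Size([3, 5])
print(causal_mask)
# tensor([[ True, False, False, False, False],
#         [ True,  True, False, False, False],
#         [ True,  True,  True, False, False]])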
@@ -174,6 +174,46 @@ class StatefulQwen2ForCausalLM(torch.nn.Module):
             use_cache=True,
         ).logits

+def generate() -> None:
+    # Construct model from transformers
+    max_context_size: int = 2048
+    torch_model = StatefulQwen2ForCausalLM(MODEL_ID, max_context_size=max_context_size)
+    torch_model.eval()
+
+    input_ids: torch.Tensor = torch.tensor([[7985, 264, 32794, 911, 60249]], dtype=torch.int32)
+    causal_mask: torch.Tensor = torch.ones((1, 1, 1, input_ids.shape[-1] + 1), dtype=torch.float32)
+
+    # Set the output length
+    output_length = 20
+
+    # Initialize the output tensor
+    output_tokens = input_ids
+
+    # Loop until the desired output length is reached
+    while output_tokens.shape[-1] < output_length + input_ids.shape[-1]:
+        # Compute the past seen tokens used for updating key/value cache slices
+        #torch_model.kv_cache.past_seen_tokens = causal_mask.shape[-1] - output_tokens.shape[-1]
+
+        # Get the model output
+        model_inp = output_tokens[:, -20:]
+        print(f"KEVINDEBUG model_inp: {model_inp} causal_mask: {causal_mask}")
+        output = torch_model(model_inp, causal_mask)  # Feed at most the last 20 tokens; older tokens fall out of the window passed to the model.
+
+        # Get the most likely token IDs
+        output_ids = torch.argmax(output, dim=-1)
+
+        # Append the generated token IDs to the output tensor
+        output_tokens = torch.cat((output_tokens, output_ids[:, -1, None]), dim=-1)
+        print(f"KEVINDEBUG output_tokens: {output_tokens}")
+
+        # Update the causal mask
+        causal_mask = torch.ones((1, 1, 1, output_tokens.shape[-1] + 1), dtype=torch.float32)
+
+    # Decode output tokens using the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+    print(f"input : {tokenizer.decode(input_ids[0])} output: {decoded_output}")

 def export() -> None:
     # Construct model from transformers and trace to TorchScript
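The generate() function added above performs plain greedy decoding against the stateful PyTorch model: each step takes the argmax over the last position's logits, appends it, and grows the causal mask by one. A quick cross-check against the stock transformers generation API is sketched below; it is not part of this commit, and the checkpoint name is an assumed value for MODEL_ID (the constant itself is defined outside the hunks shown here).

# Hedged cross-check: greedy decoding with the vanilla Hugging Face model.
# "Qwen/Qwen2-0.5B-Instruct" is an assumed value for MODEL_ID; adjust to whatever the script defines.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

prompt_ids = torch.tensor([[7985, 264, 32794, 911, 60249]])  # decodes to "Write a poem about Valencia"
with torch.no_grad():
    generated = model.generate(prompt_ids, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(generated[0], skip_special_tokens=True))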
@@ -217,6 +257,8 @@ def export() -> None:
         minimum_deployment_target=ct.target.iOS18,
         skip_model_load=True,
     )
+    mlmodel_fp16._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
+    mlmodel_fp16.save("StatefulQwen2_0_5_BInstructFP16.mlpackage")

     # Block-wise quantize model weights to int4
     op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
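The two lines added above tag the FP16 package with the tokenizer ID by writing into the raw protobuf spec before saving. Once the package is on disk, the same entry should be visible through coremltools' public user_defined_metadata accessor; a minimal read-back check (not part of this commit, and it needs a Mac where Core ML can load the package) might look like:

# Read back the user-defined metadata written by the export step above.
import coremltools as ct

mlmodel = ct.models.MLModel("StatefulQwen2_0_5_BInstructFP16.mlpackage")
print(mlmodel.user_defined_metadata)  # expected to include the METADATA_TOKENIZER -> MODEL_ID entry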
@@ -232,4 +274,91 @@ def export() -> None:


 if __name__ == "__main__":
-
+    generate()
+
+###
+#(venv) kevin36524@instance-20240808-212842:~$ python export_qwen2_wc.py
+#Failed to load _MLModelProxy: No module named 'coremltools.libcoremlpython'
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249]], dtype=torch.int32) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445]])
+#KEVINDEBUG model_inp: tensor([[ 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389, 1181,
+# 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020, 40445]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323]])
+#KEVINDEBUG model_inp: tensor([[32794, 911, 60249, 11, 17689, 11, 21080, 389, 1181, 17646,
+# 11, 7674, 11, 323, 35005, 13, 5443, 42020, 40445, 323]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323, 32976]])
+#KEVINDEBUG model_inp: tensor([[ 911, 60249, 11, 17689, 11, 21080, 389, 1181, 17646, 11,
+# 7674, 11, 323, 35005, 13, 5443, 42020, 40445, 323, 32976]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323, 32976, 7987]])
+#KEVINDEBUG model_inp: tensor([[60249, 11, 17689, 11, 21080, 389, 1181, 17646, 11, 7674,
+# 11, 323, 35005, 13, 5443, 42020, 40445, 323, 32976, 7987]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323, 32976, 7987, 11]])
+#input : Write a poem about Valencia output: Write a poem about Valencia, Spain, focusing on its architecture, culture, and cuisine. Use vivid imagery and vibrant colors,