kevin36524 committed on
Commit
934151d
·
verified ·
1 Parent(s): edd67ba

Upload export_qwen2_wc.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. export_qwen2_wc.py +131 -2
export_qwen2_wc.py CHANGED
@@ -120,7 +120,7 @@ class SliceUpdateQwen2Attention(Qwen2Attention):
120
  L, S = query_states.size(-2), key_states.size(-2)
121
  causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
122
 
123
- print(f"KEVINDEBUG query_states:{query_states.shape} key_states:{key_states.shape} value_states:{value_states.shape} causal_mask:{causal_mask}")
124
  attn_output = torch.nn.functional.scaled_dot_product_attention(
125
  query_states,
126
  key_states,
@@ -174,6 +174,46 @@ class StatefulQwen2ForCausalLM(torch.nn.Module):
174
  use_cache=True,
175
  ).logits
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  def export() -> None:
179
  # Construct model from transformers and trace to TorchScript
@@ -217,6 +257,8 @@ def export() -> None:
217
  minimum_deployment_target=ct.target.iOS18,
218
  skip_model_load=True,
219
  )
 
 
220
 
221
  # Block-wise quantize model weights to int4
222
  op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
@@ -232,4 +274,91 @@ def export() -> None:
232
 
233
 
234
  if __name__ == "__main__":
235
- export()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  L, S = query_states.size(-2), key_states.size(-2)
121
  causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
122
 
123
+ #print(f"KEVINDEBUG query_states:{query_states.shape} key_states:{key_states.shape} value_states:{value_states.shape} causal_mask:{causal_mask}")
124
  attn_output = torch.nn.functional.scaled_dot_product_attention(
125
  query_states,
126
  key_states,
 
174
  use_cache=True,
175
  ).logits
176
 
177
+ def generate() -> None:
178
+ # Construct the stateful PyTorch model from transformers and put it in eval mode (no tracing or export happens in generate())
179
+ max_context_size: int = 2048
180
+ torch_model = StatefulQwen2ForCausalLM(MODEL_ID, max_context_size=max_context_size)
181
+ torch_model.eval()
182
+
183
+ input_ids: torch.Tensor = torch.tensor([[7985, 264, 32794, 911, 60249]], dtype=torch.int32)
184
+ causal_mask: torch.Tensor = torch.ones((1, 1, 1, input_ids.shape[-1] + 1), dtype=torch.float32)
185
+
186
+ # Set the output length
187
+ output_length = 20
188
+
189
+ # Initialize the output tensor
190
+ output_tokens = input_ids
191
+
192
+ # Loop until the desired output length is reached
193
+ while output_tokens.shape[-1] < output_length + input_ids.shape[-1]:
194
+ # Compute the past seen tokens used for updating key/value cache slices
195
+ #torch_model.kv_cache.past_seen_tokens = causal_mask.shape[-1] - output_tokens.shape[-1]
196
+
197
+ # Get the model output
198
+ model_inp = output_tokens[:, -20:]
199
+ print(f"KEVINDEBUG model_inp: {model_inp} causal_mask: {causal_mask}")
200
+ output = torch_model(output_tokens[:, -20:], causal_mask) # Feed only the trailing window of at most 20 tokens, together with the current causal mask
201
+
202
+ # Get the most likely token IDs
203
+ output_ids = torch.argmax(output, dim=-1)
204
+
205
+ # Append the generated token IDs to the output tensor
206
+ output_tokens = torch.cat((output_tokens, output_ids[:, -1, None]), dim=-1)
207
+ print(f"KEVINDEBUG output_tokens: {output_tokens}")
208
+
209
+ # Update the causal mask
210
+ causal_mask = torch.ones((1, 1, 1, output_tokens.shape[-1] + 1), dtype=torch.float32)
211
+
212
+ # Decode output tokens using the tokenizer
213
+ from transformers import AutoTokenizer
214
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
215
+ decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
216
+ print(f"input : {tokenizer.decode(input_ids[0])} output: {decoded_output}")
217
 
218
  def export() -> None:
219
  # Construct model from transformers and trace to TorchScript
 
257
  minimum_deployment_target=ct.target.iOS18,
258
  skip_model_load=True,
259
  )
260
+ mlmodel_fp16._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
261
+ mlmodel_fp16.save("StatefulQwen2_0_5_BInstructFP16.mlpackage")
262
 
263
  # Block-wise quantize model weights to int4
264
  op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
 
274
 
275
 
276
  if __name__ == "__main__":
277
+ generate()
278
+
279
+ ###
280
+ #(venv) kevin36524@instance-20240808-212842:~$ python export_qwen2_wc.py
281
+ #Failed to load _MLModelProxy: No module named 'coremltools.libcoremlpython'
282
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249]], dtype=torch.int32) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1.]]]])
283
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11]])
284
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1.]]]])
285
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689]])
286
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]]])
287
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11]])
288
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
289
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080]])
290
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
291
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389]])
292
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
293
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
294
+ # 1181]])
295
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
296
+ # 1181]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
297
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
298
+ # 1181, 17646]])
299
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
300
+ # 1181, 17646]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
301
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
302
+ # 1181, 17646, 11]])
303
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
304
+ # 1181, 17646, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
305
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
306
+ # 1181, 17646, 11, 7674]])
307
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
308
+ # 1181, 17646, 11, 7674]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
309
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
310
+ # 1181, 17646, 11, 7674, 11]])
311
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
312
+ # 1181, 17646, 11, 7674, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
313
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
314
+ # 1181, 17646, 11, 7674, 11, 323]])
315
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
316
+ # 1181, 17646, 11, 7674, 11, 323]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
317
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
318
+ # 1181, 17646, 11, 7674, 11, 323, 35005]])
319
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
320
+ # 1181, 17646, 11, 7674, 11, 323, 35005]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
321
+ # 1.]]]])
322
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
323
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13]])
324
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
325
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
326
+ # 1., 1.]]]])
327
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
328
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443]])
329
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
330
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
331
+ # 1., 1., 1.]]]])
332
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
333
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020]])
334
+ #KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
335
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
336
+ # 1., 1., 1., 1.]]]])
337
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
338
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
339
+ # 40445]])
340
+ #KEVINDEBUG model_inp: tensor([[ 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389, 1181,
341
+ # 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020, 40445]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
342
+ # 1., 1., 1., 1., 1.]]]])
343
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
344
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
345
+ # 40445, 323]])
346
+ #KEVINDEBUG model_inp: tensor([[32794, 911, 60249, 11, 17689, 11, 21080, 389, 1181, 17646,
347
+ # 11, 7674, 11, 323, 35005, 13, 5443, 42020, 40445, 323]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
348
+ # 1., 1., 1., 1., 1., 1.]]]])
349
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
350
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
351
+ # 40445, 323, 32976]])
352
+ #KEVINDEBUG model_inp: tensor([[ 911, 60249, 11, 17689, 11, 21080, 389, 1181, 17646, 11,
353
+ # 7674, 11, 323, 35005, 13, 5443, 42020, 40445, 323, 32976]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
354
+ # 1., 1., 1., 1., 1., 1., 1.]]]])
355
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
356
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
357
+ # 40445, 323, 32976, 7987]])
358
+ #KEVINDEBUG model_inp: tensor([[60249, 11, 17689, 11, 21080, 389, 1181, 17646, 11, 7674,
359
+ # 11, 323, 35005, 13, 5443, 42020, 40445, 323, 32976, 7987]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
360
+ # 1., 1., 1., 1., 1., 1., 1., 1.]]]])
361
+ #KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
362
+ # 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
363
+ # 40445, 323, 32976, 7987, 11]])
364
+ #input : Write a poem about Valencia output: Write a poem about Valencia, Spain, focusing on its architecture, culture, and cuisine. Use vivid imagery and vibrant colors,