Upload export_qwen2_wc.py with huggingface_hub
export_qwen2_wc.py CHANGED (+131 -2)
@@ -120,7 +120,7 @@ class SliceUpdateQwen2Attention(Qwen2Attention):
         L, S = query_states.size(-2), key_states.size(-2)
         causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)

-        print(f"KEVINDEBUG query_states:{query_states.shape} key_states:{key_states.shape} value_states:{value_states.shape} causal_mask:{causal_mask}")
+        #print(f"KEVINDEBUG query_states:{query_states.shape} key_states:{key_states.shape} value_states:{value_states.shape} causal_mask:{causal_mask}")
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
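Note: the boolean mask built in the context lines above is a standard lower-triangular causal mask. A standalone illustration (plain PyTorch, not part of this commit) of the mask it constructs for a small query/key pair:

# Standalone sketch of the causal-mask construction shown above; uses only torch.
import torch

L, S = 3, 5  # example query length and key (cache) length
causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
print(causal_mask.shape)  # torch.Size([3, 5])
print(causal_mask)
# tensor([[ True, False, False, False, False],
#         [ True,  True, False, False, False],
#         [ True,  True,  True, False, False]])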
@@ -174,6 +174,46 @@ class StatefulQwen2ForCausalLM(torch.nn.Module):
             use_cache=True,
         ).logits

+def generate() -> None:
+    # Construct model from transformers
+    max_context_size: int = 2048
+    torch_model = StatefulQwen2ForCausalLM(MODEL_ID, max_context_size=max_context_size)
+    torch_model.eval()
+
+    input_ids: torch.Tensor = torch.tensor([[7985, 264, 32794, 911, 60249]], dtype=torch.int32)
+    causal_mask: torch.Tensor = torch.ones((1, 1, 1, input_ids.shape[-1] + 1), dtype=torch.float32)
+
+    # Set the output length
+    output_length = 20
+
+    # Initialize the output tensor
+    output_tokens = input_ids
+
+    # Loop until the desired output length is reached
+    while output_tokens.shape[-1] < output_length + input_ids.shape[-1]:
+        # Compute the past seen tokens used for updating key/value cache slices
+        #torch_model.kv_cache.past_seen_tokens = causal_mask.shape[-1] - output_tokens.shape[-1]
+
+        # Get the model output
+        model_inp = output_tokens[:, -20:]
+        print(f"KEVINDEBUG model_inp: {model_inp} causal_mask: {causal_mask}")
+        output = torch_model(model_inp, causal_mask)  # Feed at most the last 20 tokens; older tokens fall out of the window passed to the model.
+
+        # Get the most likely token IDs
+        output_ids = torch.argmax(output, dim=-1)
+
+        # Append the generated token IDs to the output tensor
+        output_tokens = torch.cat((output_tokens, output_ids[:, -1, None]), dim=-1)
+        print(f"KEVINDEBUG output_tokens: {output_tokens}")
+
+        # Update the causal mask
+        causal_mask = torch.ones((1, 1, 1, output_tokens.shape[-1] + 1), dtype=torch.float32)
+
+    # Decode output tokens using the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+    print(f"input : {tokenizer.decode(input_ids[0])} output: {decoded_output}")

 def export() -> None:
     # Construct model from transformers and trace to TorchScript
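The generate() function added above performs plain greedy decoding against the stateful PyTorch model: each step takes the argmax over the last position's logits, appends it, and grows the causal mask by one. A quick cross-check against the stock transformers generation API is sketched below; it is not part of this commit, and the checkpoint name is an assumed value for MODEL_ID (the constant itself is defined outside the hunks shown here).

# Hedged cross-check: greedy decoding with the vanilla Hugging Face model.
# "Qwen/Qwen2-0.5B-Instruct" is an assumed value for MODEL_ID; adjust to whatever the script defines.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

prompt_ids = torch.tensor([[7985, 264, 32794, 911, 60249]])  # decodes to "Write a poem about Valencia"
with torch.no_grad():
    generated = model.generate(prompt_ids, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(generated[0], skip_special_tokens=True))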
@@ -217,6 +257,8 @@ def export() -> None:
         minimum_deployment_target=ct.target.iOS18,
         skip_model_load=True,
     )
+    mlmodel_fp16._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
+    mlmodel_fp16.save("StatefulQwen2_0_5_BInstructFP16.mlpackage")

     # Block-wise quantize model weights to int4
     op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
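The two lines added above tag the FP16 package with the tokenizer ID by writing into the raw protobuf spec before saving. Once the package is on disk, the same entry should be visible through coremltools' public user_defined_metadata accessor; a minimal read-back check (not part of this commit, and it needs a Mac where Core ML can load the package) might look like:

# Read back the user-defined metadata written by the export step above.
import coremltools as ct

mlmodel = ct.models.MLModel("StatefulQwen2_0_5_BInstructFP16.mlpackage")
print(mlmodel.user_defined_metadata)  # expected to include the METADATA_TOKENIZER -> MODEL_ID entry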
@@ -232,4 +274,91 @@ def export() -> None:


 if __name__ == "__main__":
-
+    generate()
+
+###
+#(venv) kevin36524@instance-20240808-212842:~$ python export_qwen2_wc.py
+#Failed to load _MLModelProxy: No module named 'coremltools.libcoremlpython'
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249]], dtype=torch.int32) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020]])
+#KEVINDEBUG model_inp: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445]])
+#KEVINDEBUG model_inp: tensor([[ 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389, 1181,
+# 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020, 40445]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323]])
+#KEVINDEBUG model_inp: tensor([[32794, 911, 60249, 11, 17689, 11, 21080, 389, 1181, 17646,
+# 11, 7674, 11, 323, 35005, 13, 5443, 42020, 40445, 323]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323, 32976]])
+#KEVINDEBUG model_inp: tensor([[ 911, 60249, 11, 17689, 11, 21080, 389, 1181, 17646, 11,
+# 7674, 11, 323, 35005, 13, 5443, 42020, 40445, 323, 32976]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323, 32976, 7987]])
+#KEVINDEBUG model_inp: tensor([[60249, 11, 17689, 11, 21080, 389, 1181, 17646, 11, 7674,
+# 11, 323, 35005, 13, 5443, 42020, 40445, 323, 32976, 7987]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+# 1., 1., 1., 1., 1., 1., 1., 1.]]]])
+#KEVINDEBUG output_tokens: tensor([[ 7985, 264, 32794, 911, 60249, 11, 17689, 11, 21080, 389,
+# 1181, 17646, 11, 7674, 11, 323, 35005, 13, 5443, 42020,
+# 40445, 323, 32976, 7987, 11]])
+#input : Write a poem about Valencia output: Write a poem about Valencia, Spain, focusing on its architecture, culture, and cuisine. Use vivid imagery and vibrant colors,