kevin36524
committed on
Upload export_llama.py with huggingface_hub
Browse files- export_llama.py +14 -81
export_llama.py
CHANGED
@@ -226,9 +226,15 @@ def export() -> None:
|
|
226 |
max_context_size: int = 2048
|
227 |
torch_model = StatefulLlamaForCausalLM(MODEL_ID, max_context_size=max_context_size)
|
228 |
torch_model.eval()
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
traced_model = torch.jit.trace(torch_model, [input_ids, causal_mask])
|
233 |
|
234 |
# Convert traced TorchScript to Core ML format
|
@@ -263,8 +269,8 @@ def export() -> None:
|
|
263 |
minimum_deployment_target=ct.target.iOS18,
|
264 |
skip_model_load=True,
|
265 |
)
|
266 |
-
mlmodel_fp16._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
267 |
-
mlmodel_fp16.save("
|
268 |
|
269 |
# Block-wise quantize model weights to int4
|
270 |
op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
|
@@ -276,82 +282,9 @@ def export() -> None:
|
|
276 |
config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
|
277 |
mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
|
278 |
mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
279 |
-
mlmodel_int4.save("
|
280 |
|
281 |
|
282 |
if __name__ == "__main__":
|
283 |
-
|
284 |
-
|
285 |
-
###
|
286 |
-
#KEVINDEBUG model_inp: tensor([[ 8420, 374, 264, 32794, 911, 60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1.]]]])
|
287 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11]])
|
288 |
-
#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
289 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689]])
|
290 |
-
#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
291 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13]])
|
292 |
-
#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
293 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084]])
|
294 |
-
#KEVINDEBUG model_inp: tensor([[1084]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
295 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
296 |
-
# 374]])
|
297 |
-
#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
298 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
299 |
-
# 374, 264]])
|
300 |
-
#KEVINDEBUG model_inp: tensor([[264]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
301 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
302 |
-
# 374, 264, 32794]])
|
303 |
-
#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
304 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
305 |
-
# 374, 264, 32794, 911]])
|
306 |
-
#KEVINDEBUG model_inp: tensor([[911]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
307 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
308 |
-
# 374, 264, 32794, 911, 279]])
|
309 |
-
#KEVINDEBUG model_inp: tensor([[279]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
310 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
311 |
-
# 374, 264, 32794, 911, 279, 3283]])
|
312 |
-
#KEVINDEBUG model_inp: tensor([[3283]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
313 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
314 |
-
# 374, 264, 32794, 911, 279, 3283, 315]])
|
315 |
-
#KEVINDEBUG model_inp: tensor([[315]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
316 |
-
# 1.]]]])
|
317 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
318 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249]])
|
319 |
-
#KEVINDEBUG model_inp: tensor([[60249]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
320 |
-
# 1., 1.]]]])
|
321 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
322 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11]])
|
323 |
-
#KEVINDEBUG model_inp: tensor([[11]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
324 |
-
# 1., 1., 1.]]]])
|
325 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
326 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689]])
|
327 |
-
#KEVINDEBUG model_inp: tensor([[17689]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
328 |
-
# 1., 1., 1., 1.]]]])
|
329 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
330 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
331 |
-
# 13]])
|
332 |
-
#KEVINDEBUG model_inp: tensor([[13]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
333 |
-
# 1., 1., 1., 1., 1.]]]])
|
334 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
335 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
336 |
-
# 13, 576]])
|
337 |
-
#KEVINDEBUG model_inp: tensor([[576]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
338 |
-
# 1., 1., 1., 1., 1., 1.]]]])
|
339 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
340 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
341 |
-
# 13, 576, 32794]])
|
342 |
-
#KEVINDEBUG model_inp: tensor([[32794]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
343 |
-
# 1., 1., 1., 1., 1., 1., 1.]]]])
|
344 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
345 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
346 |
-
# 13, 576, 32794, 374]])
|
347 |
-
#KEVINDEBUG model_inp: tensor([[374]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
348 |
-
# 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
349 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
350 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
351 |
-
# 13, 576, 32794, 374, 5326]])
|
352 |
-
#KEVINDEBUG model_inp: tensor([[5326]]) causal_mask: tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
353 |
-
# 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
|
354 |
-
#KEVINDEBUG output_tokens: tensor([[ 8420, 374, 264, 32794, 911, 60249, 11, 17689, 13, 1084,
|
355 |
-
# 374, 264, 32794, 911, 279, 3283, 315, 60249, 11, 17689,
|
356 |
-
# 13, 576, 32794, 374, 5326, 304]])
|
357 |
-
#input : Here is a poem about Valencia output: Here is a poem about Valencia, Spain. It is a poem about the city of Valencia, Spain. The poem is written in
|
|
|
226 |
max_context_size: int = 2048
|
227 |
torch_model = StatefulLlamaForCausalLM(MODEL_ID, max_context_size=max_context_size)
|
228 |
torch_model.eval()
|
229 |
+
|
230 |
+
from transformers import AutoTokenizer
|
231 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
232 |
+
#initial_prompt = "Write a christmas Carol"
|
233 |
+
initial_prompt = "Write a poem on Apple "
|
234 |
+
|
235 |
+
input_ids = tokenizer(initial_prompt, return_tensors='pt').input_ids
|
236 |
+
causal_mask: torch.Tensor = torch.ones((1, 1, 1, input_ids.shape[-1] + 1), dtype=torch.float32)
|
237 |
+
|
238 |
traced_model = torch.jit.trace(torch_model, [input_ids, causal_mask])
|
239 |
|
240 |
# Convert traced TorchScript to Core ML format
|
|
|
269 |
minimum_deployment_target=ct.target.iOS18,
|
270 |
skip_model_load=True,
|
271 |
)
|
272 |
+
#mlmodel_fp16._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
273 |
+
#mlmodel_fp16.save("Stateful_llama_3_1_8B_InstructFP16.mlpackage")
|
274 |
|
275 |
# Block-wise quantize model weights to int4
|
276 |
op_config = ct.optimize.coreml.OpLinearQuantizerConfig(
|
|
|
282 |
config = ct.optimize.coreml.OptimizationConfig(global_config=op_config)
|
283 |
mlmodel_int4 = ct.optimize.coreml.linear_quantize_weights(mlmodel_fp16, config=config)
|
284 |
mlmodel_int4._spec.description.metadata.userDefined.update({METADATA_TOKENIZER: MODEL_ID})
|
285 |
+
mlmodel_int4.save("Stateful_llama_3_1_8B_InstructInt4.mlpackage")
|
286 |
|
287 |
|
288 |
if __name__ == "__main__":
|
289 |
+
export()
|
290 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|