Load quantized
#2
by
yoeldcd
- opened
How do I configure the pipeline function parameters to load the quantized model version from the Hub?
To use the ONNX version, you can do it like this:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
import torch
import os
# Example: run chat-style text generation with an ONNX model loaded from a
# local clone of the model repository.
#
# Clone the repository first:
#   git clone https://huggingface.co/HuggingFaceTB/SmolLM-360M-Instruct
repo_path = "path/to/local/repo"

# Copy the ONNX model you want to use into the main folder of the clone:
#   cp repo_path/onnx/model.onnx repo_path/model.onnx

# Both the tokenizer and the model are loaded from the same local directory.
tokenizer = AutoTokenizer.from_pretrained(repo_path)
model = ORTModelForCausalLM.from_pretrained(repo_path)

# Conversation to send: a system instruction followed by the user question.
messages = [
    dict(role="system", content="You are a helpful AI assistant."),
    dict(role="user", content="What is the capital of France ?"),
]

# Render the conversation to a plain prompt string (tokenize=False) and ask
# the template to append the generation prompt.
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenize the prompt and sample a completion.
inputs = tokenizer(input_text, return_tensors="pt")
gen_tokens = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
    min_length=20,
    max_length=100,
)

# Decode the generated ids back to text, dropping special tokens, and print
# the first (only) sequence.
output = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
print(output[0])
``