Load quantized

#2
by yoeldcd - opened

How do I configure the pipeline function parameters to load the quantized model version from the Hub?

Hugging Face TB Research org
edited Aug 18

To use the ONNX version, you can do it like this:

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
import torch
import os

# Path to a local clone of the model repository, obtained with e.g.:
#   git clone https://huggingface.co/HuggingFaceTB/SmolLM-360M-Instruct
repo_path = "path/to/local/repo"

# Before loading, copy the ONNX model you want into the repository's main folder:
#   cp repo_path/onnx/model.onnx repo_path/model.onnx

# Both the tokenizer and the ONNX Runtime model are read from the local clone.
tokenizer = AutoTokenizer.from_pretrained(repo_path)
model = ORTModelForCausalLM.from_pretrained(repo_path)

# Conversation to answer: a system instruction followed by one user turn.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What is the capital of France ?"},
]

# Render the conversation to a plain prompt string (not token ids) and append
# the generation prompt to it.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenize the rendered prompt into PyTorch tensors.
encoded = tokenizer(prompt, return_tensors="pt")

# Sampling settings, gathered in one place so they are easy to tweak.
# NOTE(review): in transformers, min_length / max_length are documented as
# counting the prompt tokens too -- confirm; switch to min_new_tokens /
# max_new_tokens if only the generated part should be bounded.
generation_options = {
    "do_sample": True,
    "temperature": 0.2,
    "top_p": 0.9,
    "min_length": 20,
    "max_length": 100,
}
generated_ids = model.generate(**encoded, **generation_options)

# Decode the batch without special tokens and print its first entry.
decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(decoded[0])
``

Sign up or log in to comment