import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel


class EndpointHandler:
    def __init__(self, path="unsloth/Meta-Llama-3.1-8B-bnb-4bit"):
        # Load the tokenizer and the 4-bit quantized base model. device_map="auto"
        # lets accelerate place the quantized weights; calling .to() on a
        # bitsandbytes-quantized model would raise an error.
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        base_model = AutoModelForCausalLM.from_pretrained(path, device_map="auto")
        # Attach the PEFT (LoRA) adapter weights stored at the same path.
        self.model = PeftModel.from_pretrained(base_model, path)
        self.model.eval()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __call__(self, data):
        # Extract the input text from the request payload.
        input_text = data.get("inputs", {}).get("text", "")

        # Tokenize and move the input tensors to the model's device.
        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.device)

        # Generate; pass the attention mask along with the input IDs and bound
        # the number of *new* tokens (max_length would count the prompt too).
        with torch.no_grad():
            output_tokens = self.model.generate(
                **inputs,
                max_new_tokens=1024,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Decode the generated tokens, skipping special tokens.
        generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return {"generated_text": generated_text}
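

# A minimal local smoke test, as a sketch: it assumes a CUDA-capable machine,
# since the bnb-4bit checkpoint requires bitsandbytes on GPU. The payload
# mirrors the {"inputs": {"text": ...}} shape the handler above expects;
# the prompt string is illustrative only.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": {"text": "Explain LoRA fine-tuning in one sentence."}})
    print(result["generated_text"])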