from typing import Dict, Any

import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig


class EndpointHandler:
    def __init__(self, path="samadeniyi/lora_lesson_plan_model"):
        try:
            config = PeftConfig.from_pretrained(path)
        except TypeError as e:
            print(f"Error while loading config: {e}")
            # Fallback: load the raw adapter config, drop any keys this peft
            # version does not recognize (e.g. 'layer_replication'), and rebuild
            # the config. This assumes the adapter is a LoRA adapter.
            from dataclasses import fields

            from huggingface_hub import hf_hub_download
            from peft import LoraConfig

            config_file = hf_hub_download(repo_id=path, filename="adapter_config.json")
            config_dict = PeftConfig.from_json_file(config_file)
            valid_keys = {f.name for f in fields(LoraConfig)}
            config = LoraConfig(**{k: v for k, v in config_dict.items() if k in valid_keys})

        # Define 4-bit quantization configuration (necessary for low-memory usage)
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # Load the base model using 4-bit quantization and optimized settings
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            return_dict=True,
            device_map={"": 0},  # Map everything to CUDA device 0
            trust_remote_code=True,
            quantization_config=bnb_config,
        )

        # Load the tokenizer that matches the base model
        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Attach the PEFT (Parameter-Efficient Fine-Tuning) adapter to the base model
        self.model = PeftModel.from_pretrained(self.model, path)
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`): The object should contain {"instruction": "some text", "input": "some text"}:
                - "instruction": The instruction describing what to generate.
                - "input": Context to guide the generation.

        Returns:
            A :obj:`dict` containing {"generated_text": "the generated lesson plan", "time": "..."}:
                - "generated_text": The generated text based on the input.
                - "time": The time taken to generate the output.
        """
        # Parse input data
        inputs = data.pop("inputs", data)
        instruction = inputs.get("instruction", "")
        input_context = inputs.get("input", "")

        # Build the lesson-plan prompt in the same format used during fine-tuning
        lesson_prompt = f"""Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_context}

### Response:
"""

        # Tokenize the prompt
        batch = self.tokenizer(
            lesson_prompt,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        batch = batch.to("cuda:0")

        # Configure generation settings
        generation_config = self.model.generation_config
        generation_config.top_p = 0.7
        generation_config.temperature = 0.7
        generation_config.max_new_tokens = 256
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = self.tokenizer.eos_token_id
        generation_config.eos_token_id = self.tokenizer.eos_token_id

        # Time the prediction
        start = time.time()
        with torch.cuda.amp.autocast():
            output_tokens = self.model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                generation_config=generation_config,
            )
        end = time.time()

        # Decode generated tokens into text
        generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        # Return the generated text and the time taken
        return {"generated_text": generated_text, "time": f"{(end - start):.2f} s"}
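

# --- Local smoke test (sketch) ---
# A minimal, hedged example of how this handler might be exercised outside the
# Inference Endpoints runtime. It assumes a CUDA-capable machine with access to
# the base model and adapter; the payload shape and the instruction/input text
# below are illustrative only and mirror the docstring of __call__.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "inputs": {
            "instruction": "Create a 40-minute lesson plan introducing fractions.",
            "input": "Subject: Mathematics. Audience: primary school students.",
        }
    }
    result = handler(payload)
    print(result["generated_text"])
    print(f"Generation took {result['time']}")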