import json
import time
from typing import Any, Dict

import torch
from huggingface_hub import hf_hub_download
from peft import LoraConfig, PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class EndpointHandler:
def __init__(self, path="samadeniyi/lora_lesson_plan_model"):
        try:
            config = PeftConfig.from_pretrained(path)
        except TypeError as e:
            # Older peft releases reject newer adapter-config keys such as
            # 'layer_replication'. Best-effort fallback: load the raw
            # adapter_config.json, keep only the fields this peft version
            # understands, and rebuild the config from them.
            print(f"Error while loading config: {e}")
            config_file = hf_hub_download(repo_id=path, filename="adapter_config.json")
            with open(config_file) as f:
                config_dict = json.load(f)
            valid_fields = set(LoraConfig.__dataclass_fields__)
            config = LoraConfig(**{k: v for k, v in config_dict.items() if k in valid_fields})
        # 4-bit (NF4) quantization configuration to keep GPU memory usage low
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
# Load the model using 4-bit quantization and optimized settings
self.model = AutoModelForCausalLM.from_pretrained(
config.base_model_name_or_path,
return_dict=True,
device_map={"": 0}, # Map to CUDA device 0
trust_remote_code=True,
quantization_config=bnb_config,
)
# Load tokenizer and ensure it matches the model
self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
self.tokenizer.pad_token = self.tokenizer.eos_token
        # Load the LoRA adapter weights on top of the base model via PEFT
self.model = PeftModel.from_pretrained(self.model, path)
def __call__(self, data: Any) -> Dict[str, Any]:
"""
Args:
data :obj:`dict`:. The object should contain {"instruction": "some text", "input": "some text"}:
- "instruction": The instruction describing what to generate.
- "input": Context to guide the generation.
Returns:
A :obj:`dict` containing {"generated_text": "the generated lesson plan", "time": "..."}:
- "generated_text": The generated text based on the input.
- "time": The time taken to generate the output.
"""
# Parse input data
inputs = data.pop("inputs", data)
instruction = inputs.get("instruction", "")
input_context = inputs.get("input", "")
# Create the lesson plan prompt based on your preparation format
lesson_prompt = f"""Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input_context}
### Response:
"""
# Tokenize the prompt
batch = self.tokenizer(
lesson_prompt,
padding=True,
truncation=True,
return_tensors='pt'
)
batch = batch.to('cuda:0')
# Configure generation settings
generation_config = self.model.generation_config
generation_config.top_p = 0.7
generation_config.temperature = 0.7
generation_config.max_new_tokens = 256
generation_config.num_return_sequences = 1
generation_config.pad_token_id = self.tokenizer.eos_token_id
generation_config.eos_token_id = self.tokenizer.eos_token_id
# Time the prediction
start = time.time()
with torch.cuda.amp.autocast():
output_tokens = self.model.generate(
input_ids=batch.input_ids,
generation_config=generation_config,
)
end = time.time()
        # Decode only the newly generated tokens (drop the echoed prompt)
        prompt_length = batch.input_ids.shape[1]
        generated_text = self.tokenizer.decode(output_tokens[0][prompt_length:], skip_special_tokens=True)
# Return the generated text and the time taken
return {"generated_text": generated_text, "time": f"{(end-start):.2f} s"}