llama2-7b-chat-hf / handler.py
from typing import Dict, List, Any
import torch
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np


def softmax(x):
    """Numerically stable softmax over a 1-D array of logits."""
    z = x - np.max(x)
    numerator = np.exp(z)
    denominator = np.sum(numerator)
    return numerator / denominator
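
# Example with hypothetical logits (values not produced by the model):
# softmax(np.array([2.0, 1.0, 0.1])) -> roughly [0.66, 0.24, 0.10],
# a probability distribution over the four answer choices.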


class EndpointHandler():
    def __init__(self, path=""):
        self.accelerator = Accelerator()
        self.device = self.accelerator.device
        # device_map="auto" places the model's weights on the available devices at
        # load time; accelerator.prepare() then wraps it for the Accelerate runtime.
        self.model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, device_map="auto")
        self.model = self.accelerator.prepare(self.model)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        # Token id of the final piece of each answer letter, used to read the
        # logits for "A"/"B"/"C"/"D" at the last position of the prompt.
        self.options_tokens = [self.tokenizer.encode(choice)[-1] for choice in ["A", "B", "C", "D"]]

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            prompt (:obj:`str`): multiple-choice prompt ending at the answer position
        Return:
            A :obj:`list` of :obj:`dict` that will be serialized and returned
        """
        with torch.no_grad():
            prompt = data.pop("prompt")
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            outputs = self.model(**inputs)
            # Logits predicted for the token immediately after the prompt.
            last_token_logits = outputs.logits[:, -1, :]
            # Keep only the logits of the answer-choice tokens; move to CPU for numpy.
            options_tokens_logits = last_token_logits[:, self.options_tokens].detach().cpu().numpy()
            conf = softmax(options_tokens_logits[0])
            pred = np.argmax(options_tokens_logits[0])
            # Cast to built-in types so the response is JSON-serializable
            # (pred indexes into ["A", "B", "C", "D"]).
            return [{"pred": int(pred), "conf": conf.tolist()}]
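

# Minimal local smoke test: a sketch, not part of the serving contract.
# Assumes a local model directory "./llama2-7b-chat-hf" (hypothetical path);
# on a Hugging Face Inference Endpoint, EndpointHandler is instantiated by the
# runtime and this block never runs.
if __name__ == "__main__":
    handler = EndpointHandler(path="./llama2-7b-chat-hf")
    question = (
        "Which planet is known as the Red Planet?\n"
        "A. Venus\nB. Mars\nC. Jupiter\nD. Saturn\n"
        "Answer:"
    )
    result = handler(data={"prompt": question})
    print(result)  # e.g. [{"pred": 1, "conf": [...]}], where index 1 maps to "B"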