from typing import Dict, Any
from transformers import LlamaForCausalLM, LlamaTokenizer
import torch
class EndpointHandler:
    def __init__(self, path=""):
        # Model configuration
        self.model_name_or_path = path or "souzat19/Llama3.1_fn14133.29122024"
        # Detect whether a GPU is available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        print("Initializing tokenizer...")
        self.tokenizer = LlamaTokenizer.from_pretrained(
            self.model_name_or_path,
            trust_remote_code=True
        )

        print("Initializing model...")
        self.model = LlamaForCausalLM.from_pretrained(
            self.model_name_or_path,
            torch_dtype=torch.float32,
            trust_remote_code=True,
            device_map="auto" if torch.cuda.is_available() else None,
            local_files_only=bool(path)
        )
        if not torch.cuda.is_available():
            self.model = self.model.to("cpu")
        print("Model initialized successfully")

        # Alpaca-style prompt template; the instruction is kept in Portuguese because it
        # is part of the model prompt and sets the assistant up as a specialist in
        # Brazilian public procurement planning under Lei 14.133/2021
        self.prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Você é um assistente especializado em planejamento de compras públicas de acordo com a Lei 14.133/2021 e regulamentos infralegais. Responda de forma clara, detalhada e didática e utilize exemplos práticos para explicar os conceitos.
### Input:
{input}
### Response:
"""
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        try:
            # Extract the input text
            input_text = data.get("text", "")
            if not input_text:
                return {"error": "Input text is required"}

            # Format the prompt
            formatted_prompt = self.prompt_template.format(input=input_text)

            # Tokenize the input
            inputs = self.tokenizer(
                formatted_prompt,
                return_tensors="pt",
                truncation=True,
                max_length=4096,
                add_special_tokens=True
            )
            if torch.cuda.is_available():
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Generate the response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=2096,
                    temperature=0.5,
                    top_p=0.95,
                    top_k=50,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            # Decode the generated tokens
            response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Keep only the text after the "### Response:" marker
            if "### Response:" in response_text:
                answer = response_text.split("### Response:")[1].strip()
            else:
                answer = response_text.strip()

            return {"response": answer}
        except Exception as e:
            return {"error": f"Error during inference: {str(e)}"}