souzat19
/

Llama3.1_fn14133.29122024

@@ -1,26 +1,18 @@
 from typing import Dict, Any
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
 class EndpointHandler:
     def __init__(self, path=""):
         # Configuração do modelo
         self.model_name_or_path = "souzat19/Llama3.1_fn14133.29122024"
-        self.model_basename = "unsloth.Q8_0.gguf"
-        # Download do modelo
-        model_path = hf_hub_download(
-            repo_id=self.model_name_or_path,
-            filename=self.model_basename
-        )
-        # Inicialização do modelo
-        self.model = Llama(
-            model_path=model_path,
-            n_threads=2,
-            n_batch=512,
-            n_gpu_layers=-1,
-            n_ctx=4096
         )
         # Template do prompt no formato Alpaca
@@ -46,25 +38,33 @@ Você é um assistente especializado em planejamento de compras públicas de aco
         try:
             # Extrai o texto da entrada
             input_text = data.get("text", "")
-            if not input_text:
                 return {"error": "Input text is required"}
             # Formata o prompt
             formatted_prompt = self.prompt_template.format(input=input_text)
             # Gera a resposta
-            response = self.model(
-                prompt=formatted_prompt,
-                max_tokens=2096,
                 temperature=0.5,
                 top_p=0.95,
                 top_k=50,
-                stop=['### Response:', '### Input:', '### Instruction:'],
-                echo=True
             )
-            # Extrai a resposta do modelo
-            response_text = response['choices'][0]['text']
             # Processa a resposta para extrair apenas a parte após "### Response:"
             if "### Response:" in response_text:

 from typing import Dict, Any
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 class EndpointHandler:
     def __init__(self, path=""):
         # Configuração do modelo
         self.model_name_or_path = "souzat19/Llama3.1_fn14133.29122024"
+        # Inicialização do modelo e tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_name_or_path,
+            torch_dtype=torch.float16,
+            device_map="auto"
         )
         # Template do prompt no formato Alpaca
         try:
             # Extrai o texto da entrada
             input_text = data.get("text", "")
+            if not input_text or not self.validate_input(input_text):
                 return {"error": "Input text is required"}
+            # Pré-processa o texto
+            input_text = self.preprocess(input_text)
             # Formata o prompt
             formatted_prompt = self.prompt_template.format(input=input_text)
+            # Tokeniza o input
+            inputs = self.tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=4096)
+            inputs = inputs.to(self.model.device)
             # Gera a resposta
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=2096,
                 temperature=0.5,
                 top_p=0.95,
                 top_k=50,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+                do_sample=True
             )
+            # Decodifica a resposta
+            response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             # Processa a resposta para extrair apenas a parte após "### Response:"
             if "### Response:" in response_text: