souzat19
/

Llama3.1_fn14133.29122024

@@ -1,5 +1,5 @@
 from typing import Dict, Any
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 class EndpointHandler:
@@ -11,18 +11,26 @@ class EndpointHandler:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
         # Inicialização do modelo e tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name_or_path,
             trust_remote_code=True,
-            use_cache=True,
-            low_cpu_mem_usage=True
         )
-        # Move modelo para GPU se disponível
-        self.model = self.model.to(self.device)
         # Template do prompt no formato Alpaca
         self.prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
@@ -46,7 +54,15 @@ Você é um assistente especializado em planejamento de compras públicas de aco
             formatted_prompt = self.prompt_template.format(input=input_text)
             # Tokeniza o input
-            inputs = self.tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=4096)
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
             # Gera a resposta
@@ -80,8 +96,6 @@ Você é um assistente especializado em planejamento de compras públicas de aco
         """
         Pré-processa o texto de entrada se necessário
         """
-        # Remove espaços extras e normaliza quebras de linha
-        text = " ".join(text.split())
         return text.strip()
     def validate_input(self, text: str) -> bool:

 from typing import Dict, Any
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import torch
 class EndpointHandler:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
+        # Load the model config explicitly so it can be passed to from_pretrained below (intended to avoid automatic quantization)
+        config = AutoConfig.from_pretrained(self.model_name_or_path)
         # Inicialização do modelo e tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name_or_path,
+            trust_remote_code=True
+        )
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name_or_path,
+            config=config,
+            torch_dtype=torch.float32,  # Força o uso de float32
+            device_map="auto",
             trust_remote_code=True,
+            use_safetensors=True,
+            load_in_4bit=False,
+            load_in_8bit=False
         )
         # Template do prompt no formato Alpaca
         self.prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
             formatted_prompt = self.prompt_template.format(input=input_text)
             # Tokeniza o input
+            inputs = self.tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=4096,
+                add_special_tokens=True
+            )
+            # Move the tokenized input tensors to the selected device (self.device)
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
             # Gera a resposta
         """
         Pré-processa o texto de entrada se necessário
         """
         return text.strip()
     def validate_input(self, text: str) -> bool: