Yhhxhfh committed
Commit 6133a63
Parent: c069edf

Update app.py

Files changed (1): app.py (+12, -32)
app.py CHANGED
@@ -1,26 +1,24 @@
 import os
-from pydantic import BaseModel
+import gc
+import tempfile
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import re
 import gradio as gr
-from dotenv import load_dotenv
 from fastapi import FastAPI, Request, HTTPException
 from fastapi.responses import JSONResponse
 from tqdm import tqdm
+from dotenv import load_dotenv
 from functools import lru_cache
 import urllib3
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
-# Install the llama-cpp-python library
 os.system("pip install llama-cpp-python")
 
 app = FastAPI()
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
-# Global model configuration
 global_data = {
     'model_configs': [
         {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
@@ -36,7 +34,6 @@ global_data = {
     ]
 }
 
-# Model loading manager
 class ModelManager:
     def __init__(self):
         self.models = {}
@@ -52,10 +49,16 @@ class ModelManager:
         model_name = model_config['name']
         if model_name not in self.models:
             try:
-                self.models[model_name] = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
+                tempdir = tempfile.TemporaryDirectory()
+                filepath = os.path.join(tempdir.name, model_config['filename'])
+                model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
+                self.models[model_name] = model
+                model.model.model_path = filepath
             except Exception as e:
                 print(f"Error loading {model_name}: {e}")
                 self.models[model_name] = None
+            finally:
+                gc.collect()
 
     def get_model(self, model_name):
         return self.models.get(model_name)
@@ -65,37 +68,16 @@ model_manager = ModelManager()
 class ChatRequest(BaseModel):
     message: str
 
-# Input normalization
-def normalize_input(input_text):
-    return input_text.strip()
-
-# Remove duplicates from the response
-def remove_duplicates(text):
-    text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you?', text)
-    text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you?', text)
-    text = text.replace('[/INST]', '')
-    lines = text.split('\n')
-    unique_lines = []
-    seen_lines = set()
-    for line in lines:
-        if line not in seen_lines:
-            unique_lines.append(line)
-            seen_lines.add(line)
-    return '\n'.join(unique_lines)
-
-# Model response generation
 @lru_cache(maxsize=128)
 def generate_model_response(model, inputs):
     try:
         response = model(inputs, max_tokens=150)
-        return remove_duplicates(response['choices'][0]['text'])
+        return response['choices'][0]['text']
     except Exception as e:
-        print(f"Error generating response: {e}")
         return f"Error: Could not generate a response. Details: {e}"
 
-# Message processing
 async def process_message(message):
-    inputs = normalize_input(message)
+    inputs = message.strip()
     responses = {}
 
     with ThreadPoolExecutor(max_workers=len(global_data['model_configs'])) as executor:
@@ -120,7 +102,6 @@ async def api_generate_multimodel(request: Request):
     except Exception as e:
         return JSONResponse({"error": str(e)}, status_code=500)
 
-# Gradio interface
 iface = gr.Interface(
     fn=process_message,
     inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
@@ -130,7 +111,6 @@ iface = gr.Interface(
     live=False
 )
 
-# Launch the server
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
     iface.launch(server_port=port)