Update app.py
Browse files
app.py
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
-
from fastapi import FastAPI, HTTPException, Request
|
2 |
from pydantic import BaseModel
|
3 |
from llama_cpp import Llama
|
4 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
5 |
-
import uvicorn
|
6 |
import re
|
|
|
7 |
from spaces import GPU
|
8 |
-
|
9 |
-
app = FastAPI()
|
10 |
|
11 |
global_data = {
|
12 |
'models': {},
|
@@ -44,6 +42,7 @@ model_configs = [
|
|
44 |
{"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
|
45 |
]
|
46 |
|
|
|
47 |
class ModelManager:
|
48 |
def __init__(self):
|
49 |
self.models = {}
|
@@ -83,7 +82,7 @@ def remove_duplicates(text):
|
|
83 |
seen_lines.add(line)
|
84 |
return '\n'.join(unique_lines)
|
85 |
|
86 |
-
@GPU(duration=0)
|
87 |
def generate_model_response(model, inputs):
|
88 |
try:
|
89 |
response = model(inputs)
|
@@ -92,30 +91,36 @@ def generate_model_response(model, inputs):
|
|
92 |
print(f"Error generating model response: {e}")
|
93 |
return ""
|
94 |
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
-
@app.middleware("http")
async def process_request(request: Request, call_next):
    """HTTP middleware that turns unhandled downstream errors into a 500 reply.

    Args:
        request: The incoming request.
        call_next: Callable that forwards the request to the rest of the
            application and returns its response.

    Returns:
        The downstream response, or a JSON 500 response when the downstream
        call raised.
    """
    # Imported locally so this block does not depend on a file-level import.
    from fastapi.responses import JSONResponse

    try:
        return await call_next(request)
    except Exception as e:
        print(f"Request error: {e}")
        # Return the error response directly: an HTTPException raised from
        # middleware is outside the reach of the app's exception handlers, so
        # it would surface as an unhandled server error instead of this payload.
        return JSONResponse(status_code=500, content={"detail": "Internal Server Error"})
|
119 |
|
120 |
def remove_repetitive_responses(responses):
|
121 |
unique_responses = {}
|
@@ -125,4 +130,5 @@ def remove_repetitive_responses(responses):
|
|
125 |
return unique_responses
|
126 |
|
127 |
if __name__ == "__main__":
|
128 |
-
|
|
|
|
|
|
1 |
from pydantic import BaseModel
|
2 |
from llama_cpp import Llama
|
3 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
4 |
import re
|
5 |
+
import httpx
|
6 |
from spaces import GPU
|
7 |
+
import asyncio
|
|
|
8 |
|
9 |
global_data = {
|
10 |
'models': {},
|
|
|
42 |
{"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
|
43 |
]
|
44 |
|
45 |
+
|
46 |
class ModelManager:
|
47 |
def __init__(self):
|
48 |
self.models = {}
|
|
|
82 |
seen_lines.add(line)
|
83 |
return '\n'.join(unique_lines)
|
84 |
|
85 |
+
@GPU(duration=0)
|
86 |
def generate_model_response(model, inputs):
|
87 |
try:
|
88 |
response = model(inputs)
|
|
|
91 |
print(f"Error generating model response: {e}")
|
92 |
return ""
|
93 |
|
94 |
+
async def handle_request(request):
    """Dispatch one request, serving only ``POST /generate``.

    For a matching request the JSON payload is validated as a ``ChatRequest``,
    its message is passed through ``normalize_input``, and every model in
    ``global_data['models']`` is queried via ``generate_model_response`` in a
    thread pool. The collected answers are passed through
    ``remove_repetitive_responses`` and returned with status 200.

    Args:
        request: Object exposing ``method``, ``url.path`` and ``json()``.
            NOTE(review): ``json()`` is called synchronously here; confirm the
            request type supplied by callers supports that.

    Returns:
        An ``httpx.Response``: 200 with the filtered responses, 500 with an
        ``error`` message if anything raised while handling the request, or
        404 for any other method/path.
    """
    if not (request.method == "POST" and request.url.path == "/generate"):
        return httpx.Response(status_code=404, text="Not Found")

    try:
        chat_request = ChatRequest(**request.json())
        inputs = normalize_input(chat_request.message)
        with ThreadPoolExecutor() as executor:
            # Remember which model each future belongs to: as_completed()
            # yields futures in completion order, not submission order, so
            # zipping it against the model names would mislabel responses.
            future_to_name = {
                executor.submit(generate_model_response, model, inputs): model_name
                for model_name, model in global_data['models'].items()
            }
            responses = [
                {'model': future_to_name[future], 'response': future.result()}
                for future in as_completed(future_to_name)
            ]
        unique_responses = remove_repetitive_responses(responses)
        return httpx.Response(status_code=200, json=unique_responses)
    except Exception as e:
        # Boundary handler: report the failure to the caller instead of crashing.
        print(f"Error handling request: {e}")
        return httpx.Response(status_code=500, json={"error": f"Error handling request: {e}"})
|
113 |
+
|
114 |
+
|
115 |
+
async def run_server(port: int):
    """Poll ``http://localhost:{port}/`` forever, passing each reply to ``handle_request``.

    This opens an ``httpx.AsyncClient`` and issues GET requests in a loop; it
    does not itself listen on the port. The loop has no exit condition.

    Args:
        port: Local port number used to build the client's base URL.
    """
    base_url = f"http://localhost:{port}"
    async with httpx.AsyncClient(base_url=base_url) as client:
        while True:
            # Whatever GET "/" yields is the object dispatched below.
            fetched = await client.get("/")
            outcome = await handle_request(fetched)
            print(f"Received request: {fetched}")
            print(f"Sending response: {outcome}")
            # Wait one second before the next poll.
            await asyncio.sleep(1)
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
def remove_repetitive_responses(responses):
|
126 |
unique_responses = {}
|
|
|
130 |
return unique_responses
|
131 |
|
132 |
if __name__ == "__main__":
    # Script entry point: run the polling loop against local port 7860.
    port = 7860
    asyncio.run(run_server(port))
|