import os

# llama-cpp-python must be installed before it is imported below; on a
# Hugging Face Space this normally belongs in requirements.txt, and the
# runtime install is only a fallback.
os.system("pip install llama-cpp-python")

import gc
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache

import gradio as gr
import urllib3
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from pydantic import BaseModel
from tqdm import tqdm

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
app = FastAPI()
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
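
# Registry of GGUF checkpoints to serve. Each entry names a Hugging Face Hub
# repo, the quantized file inside it, and the display label used as the key
# in ModelManager.models.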
global_data = {
    'model_configs': [
        {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
        {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
        {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
        {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
        {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
        {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
        {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
        {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
        {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
        {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
    ]
}
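
# ModelManager downloads every configured checkpoint in parallel and keeps one
# llama.cpp instance per display name; models that fail to load are stored as
# None so the rest of the app can skip them.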
class ModelManager:
    def __init__(self):
        self.models = {}
        self.load_all_models()

    def load_all_models(self):
        # Download and initialize all configured models concurrently.
        with ThreadPoolExecutor(max_workers=len(global_data['model_configs'])) as executor:
            futures = [executor.submit(self._load_model, config) for config in tqdm(global_data['model_configs'], desc="Loading models")]
            for future in tqdm(as_completed(futures), total=len(global_data['model_configs']), desc="Loading models complete"):
                future.result()
    def _load_model(self, model_config):
        model_name = model_config['name']
        if model_name not in self.models:
            try:
                # from_pretrained downloads the GGUF file into the local
                # Hugging Face cache and loads it with llama.cpp.
                model = Llama.from_pretrained(
                    repo_id=model_config['repo_id'],
                    filename=model_config['filename'],
                    use_auth_token=HUGGINGFACE_TOKEN,
                )
                self.models[model_name] = model
            except Exception as e:
                print(f"Error loading {model_name}: {e}")
                self.models[model_name] = None
            finally:
                gc.collect()
    def get_model(self, model_name):
        # Returns the loaded Llama instance for a display name, or None if loading failed.
        return self.models.get(model_name)
model_manager = ModelManager()
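
# Expected JSON body for /generate_multimodel. The endpoint below currently
# reads the payload manually via request.json(), so this model only documents
# the schema.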
class ChatRequest(BaseModel):
    message: str
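
# lru_cache keys on the (model instance, prompt) pair, so repeating the same
# prompt against the same model returns the cached completion instead of
# re-running inference.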
@lru_cache(maxsize=128)
def generate_model_response(model, inputs):
    try:
        response = model(inputs, max_tokens=150)
        return response['choices'][0]['text']
    except Exception as e:
        return f"Error: Could not generate a response. Details: {e}"
async def process_message(message):
    inputs = message.strip()
    responses = {}
    # Query only the models that loaded successfully, and map each future back
    # to its model name (as_completed yields futures in completion order, not
    # submission order).
    loaded = [(config['name'], model_manager.get_model(config['name']))
              for config in global_data['model_configs']
              if model_manager.get_model(config['name'])]
    with ThreadPoolExecutor(max_workers=max(len(loaded), 1)) as executor:
        future_to_name = {executor.submit(generate_model_response, model, inputs): name for name, model in loaded}
        for future in tqdm(as_completed(future_to_name), total=len(future_to_name), desc="Generating responses"):
            responses[future_to_name[future]] = future.result()
    return "\n\n".join(f"**{name}:**\n{response}" for name, response in responses.items())
@app.post("/generate_multimodel")
async def api_generate_multimodel(request: Request):
    try:
        data = await request.json()
        message = data.get("message")
        if not message:
            raise HTTPException(status_code=400, detail="Missing message")
        response = await process_message(message)
        return JSONResponse({"response": response})
    except HTTPException as e:
        raise e
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
iface = gr.Interface(
    fn=process_message,
    inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
    outputs=gr.Markdown(),
    title="Multi-Model LLM API",
    description="Enter a message and get responses from multiple LLMs.",
    live=False,
)
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    iface.launch(server_port=port)
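
# Note: iface.launch() starts Gradio's own server, so the FastAPI routes above
# are not exposed by this entry point. A minimal sketch of serving the UI and
# the REST endpoint from one process (assumes uvicorn is installed; path and
# host are illustrative):
#
#   import uvicorn
#   app = gr.mount_gradio_app(app, iface, path="/")
#   uvicorn.run(app, host="0.0.0.0", port=port)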