File size: 8,524 Bytes
9883ddb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from langchain import LLMChain
from langchain.llms import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import uvicorn
from dotenv import load_dotenv
import io
import requests
import asyncio
import time

# Load environment variables from a .env file, if one is present
load_dotenv()

# Create the FastAPI application
app = FastAPI()

# Model catalogue: one entry per model to download and load.
#   repo_id  - repository path used to build the download URL
#   filename - file inside the repository to fetch
#   name     - display name used in log messages and chat responses
model_configs = [
    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-15b-Q2_K-GGUF", "filename": "starcoder2-15b-q2_k.gguf", "name": "Starcoder2 15B"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-2b-it-Q2_K-GGUF", "filename": "gemma-2-2b-it-q2_k.gguf", "name": "Gemma 2-2B IT"},
    {"repo_id": "Ffftdtd5dtft/sarvam-2b-v0.5-Q2_K-GGUF", "filename": "sarvam-2b-v0.5-q2_k.gguf", "name": "Sarvam 2B v0.5"},
    {"repo_id": "Ffftdtd5dtft/WizardLM-13B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-13b-uncensored-q2_k.gguf", "name": "WizardLM 13B Uncensored"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-72b-instruct-q2_k.gguf", "name": "Qwen2 Math 72B Instruct"},
    {"repo_id": "Ffftdtd5dtft/WizardLM-7B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-7b-uncensored-q2_k.gguf", "name": "WizardLM 7B Uncensored"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-Math-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-7b-instruct-q2_k.gguf", "name": "Qwen2 Math 7B Instruct"}
]

# Model manager: downloads and loads the configured models
class ModelManager:
    """Download every configured model into memory and keep the loaded results.

    Loaded models are appended to ``self.models`` as
    ``{"model_data": {...}, "name": <display name>}`` dictionaries.
    """

    def __init__(self):
        # Successfully loaded models, in the order their loads completed.
        self.models = []
        # Not read or written anywhere else in this class.
        self.configs = {}

    async def download_model_to_memory(self, model_config):
        """Fetch one model file over HTTP and return it as an in-memory buffer.

        Args:
            model_config: Mapping with ``name``, ``repo_id`` and ``filename``
                keys.

        Returns:
            An ``io.BytesIO`` wrapping the full response body.

        Raises:
            Exception: If the server answers with a status other than 200.
        """
        print(f"Descargando modelo: {model_config['name']}...")
        # NOTE(review): the host is written with a trailing dot
        # ("huggingface.co."); kept unchanged -- confirm it is intentional.
        url = f"https://huggingface.co./{model_config['repo_id']}/resolve/main/{model_config['filename']}"
        # requests is a blocking client. Running it in the default executor
        # keeps the event loop responsive and lets the downloads started by
        # load_all_models overlap instead of running one after another.
        response = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: requests.get(url)
        )
        if response.status_code == 200:
            return io.BytesIO(response.content)
        raise Exception(f"Error al descargar el modelo: {response.status_code}")

    @staticmethod
    async def _load_part(part):
        """Simulate loading one part of a model with a short delay."""
        await asyncio.sleep(0.1)

    async def load_model(self, model_config):
        """Download and load one model, then append it to ``self.models``.

        Any exception raised along the way is printed and swallowed, so this
        coroutine never raises; a model that fails is simply absent from
        ``self.models``.

        Args:
            model_config: Mapping with ``name``, ``repo_id`` and ``filename``
                keys.
        """
        try:
            start_time = time.time()
            model_file = await self.download_model_to_memory(model_config)
            print(f"Cargando modelo: {model_config['name']}...")

            # When the download took longer than one second, run the simulated
            # five-part split load before the real load.
            if time.time() - start_time > 1:
                print(f"Modelo {model_config['name']} tard贸 m谩s de 1 segundo en cargarse, dividiendo la carga...")
                await asyncio.gather(*(self._load_part(part) for part in range(5)))

            # Load exactly once, off the event loop. Previously the fast path
            # loaded the model and then fell through into a second load.
            # NOTE(review): confirm that Llama.from_pretrained accepts an
            # in-memory file object.
            model = await asyncio.get_running_loop().run_in_executor(
                None,
                lambda: Llama.from_pretrained(model_file)
            )
            tokenizer = model.tokenizer

            # Keep the model, its tokenizer and the special tokens together.
            model_data = {
                'model': model,
                'tokenizer': tokenizer,
                'pad_token': tokenizer.pad_token,
                'pad_token_id': tokenizer.pad_token_id,
                'eos_token': tokenizer.eos_token,
                'eos_token_id': tokenizer.eos_token_id,
                'bos_token': tokenizer.bos_token,
                'bos_token_id': tokenizer.bos_token_id,
                'unk_token': tokenizer.unk_token,
                'unk_token_id': tokenizer.unk_token_id
            }

            self.models.append({"model_data": model_data, "name": model_config['name']})
        except Exception as e:
            # Best-effort: report the failure and carry on.
            print(f"Error al cargar el modelo: {e}")

    async def load_all_models(self):
        """Load every entry of ``model_configs`` concurrently and print the elapsed time."""
        print("Iniciando carga de modelos...")
        start_time = time.time()
        tasks = [self.load_model(config) for config in model_configs]
        await asyncio.gather(*tasks)
        end_time = time.time()
        print(f"Todos los modelos han sido cargados en {end_time - start_time:.2f} segundos.")

# Create the shared ModelManager; its models are loaded by the startup hook
model_manager = ModelManager()

@app.on_event("startup")
async def startup_event():
    """Run on the FastAPI "startup" event and load all configured models."""
    await model_manager.load_all_models()

# Request model for the chat endpoint
class ChatRequest(BaseModel):
    """Request body for the chat endpoint: a message plus sampling parameters."""
    # Text sent to every loaded model.
    message: str
    # Sampling parameters, forwarded unchanged to the model call.
    top_k: int = 50
    top_p: float = 0.95
    temperature: float = 0.7

# Response size limit. It is passed as max_length to the model call and is also
# the chunk size, counted in characters, used by split_long_response.
TOKEN_LIMIT = 1000  # Limit allowed per response

# Generate a chat response from one loaded model
async def generate_chat_response(request, model_data):
    """Ask one loaded model to answer ``request`` and package the result.

    Args:
        request: Object exposing ``message``, ``top_k``, ``top_p`` and
            ``temperature`` attributes.
        model_data: Entry with a ``name`` key and a ``model_data`` mapping
            holding the callable model under ``model``.

    Returns:
        A dict with ``response`` (generated text, or a fixed error message
        when generation fails), ``literal`` (the input text that was used)
        and ``model_name``.
    """
    # Bind a fallback before the try block. If normalization itself fails, the
    # error path below can still report the input instead of raising
    # UnboundLocalError from inside the except handler.
    user_input = request.message
    try:
        # NOTE(review): normalize_input is not defined or imported in the
        # visible part of this file -- confirm where it comes from.
        user_input = normalize_input(request.message)
        llm = model_data['model_data']['model']

        # Run the blocking model call in the default executor so the event
        # loop stays responsive.
        response = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: llm(user_input, max_length=TOKEN_LIMIT, do_sample=True, top_k=request.top_k, top_p=request.top_p, temperature=request.temperature)
        )
        generated_text = response['generated_text']
        # Break up long output.
        split_response = split_long_response(generated_text)
        return {"response": split_response, "literal": user_input, "model_name": model_data['name']}
    except Exception as e:
        # Best-effort: report the failure and return a placeholder answer.
        print(f"Error al generar la respuesta: {e}")
        return {"response": "Error al generar la respuesta", "literal": user_input, "model_name": model_data['name']}

def split_long_response(response, limit=None):
    """Break ``response`` into newline-separated chunks of at most ``limit`` characters.

    The text is cut every ``limit`` characters (characters, not tokens), each
    chunk is stripped of surrounding whitespace, and the chunks are joined
    with newlines. Text that already fits in one chunk is only stripped.

    Args:
        response: Text to split.
        limit: Maximum chunk length in characters. Defaults to the
            module-level ``TOKEN_LIMIT``.

    Returns:
        The chunks joined with ``"\\n"``; an empty string for empty input.

    Raises:
        ValueError: If ``limit`` is not positive (it could never make progress).
    """
    if limit is None:
        limit = TOKEN_LIMIT
    if limit <= 0:
        raise ValueError("limit must be a positive integer")
    # Stepped slicing visits each character once; repeatedly re-slicing the
    # remaining text would copy it on every iteration.
    chunks = (response[start:start + limit] for start in range(0, len(response), limit))
    return '\n'.join(chunk.strip() for chunk in chunks)

def remove_duplicates(text):
    """Drop repeated lines from ``text``, keeping the first occurrence of each."""
    already_seen = set()
    kept = []
    for line in text.splitlines():
        if line in already_seen:
            continue
        already_seen.add(line)
        kept.append(line)
    return '\n'.join(kept)

def remove_repetitive_responses(responses):
    """Return the responses whose de-duplicated text has not been seen before.

    Each kept entry has its ``response`` value replaced, in place, by the
    output of ``remove_duplicates``. Entries whose cleaned text matches an
    earlier entry are dropped and left unmodified.
    """
    first_by_text = {}
    for entry in responses:
        cleaned = remove_duplicates(entry['response'])
        if cleaned in first_by_text:
            continue
        entry['response'] = cleaned
        # Dicts preserve insertion order, so the values come back in the
        # order the first occurrences were encountered.
        first_by_text[cleaned] = entry
    return list(first_by_text.values())

@app.post("/chat")
async def chat(request: ChatRequest):
    """Send the request to every loaded model and return the distinct answers."""
    # Each call is awaited before the next one starts, so the models are
    # queried one at a time in the order they were loaded.
    answers = [
        await generate_chat_response(request, entry)
        for entry in model_manager.models
    ]
    return {"results": remove_repetitive_responses(answers)}

# Run the FastAPI application when this file is executed as a script
if __name__ == "__main__":
    # Listen on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)