Update app.py
app.py CHANGED
@@ -20,18 +20,25 @@ model_configs = [
     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
 ]

-# Load a single model
 def load_model(model_config):
+    print(f"Loading model {model_config['repo_id']}...")
     return Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])

-# Load all models simultaneously
 def load_all_models():
+    print("Starting model load...")
     with ThreadPoolExecutor(max_workers=len(model_configs)) as executor:
         futures = [executor.submit(load_model, config) for config in model_configs]
-        models = [future.result() for future in futures]
+        models = []
+        for future in tqdm(as_completed(futures), total=len(model_configs), desc="Loading models", unit="model"):
+            try:
+                model = future.result()
+                models.append(model)
+                print(f"Model loaded successfully: {model_configs[len(models)-1]['repo_id']}")
+            except Exception as e:
+                print(f"Error loading model: {e}")
+    print("All models have been loaded.")
     return models

-# Load the models into memory
 llms = load_all_models()

 class ChatRequest(BaseModel):
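The rewritten load_all_models above fans model loading out over a thread pool and tracks completion with a tqdm bar. Below is a minimal standalone sketch of that pattern, not the app's code: fake_load and the second model_configs entry are hypothetical stand-ins so the example runs without downloading any GGUF weights. The sketch also keeps a future-to-config map, because as_completed yields futures in completion order, so indexing model_configs by how many models have finished (as the committed code does) can name the wrong repo_id when loads finish out of order.

# Standalone sketch of the concurrent-loading pattern used above.
# fake_load stands in for Llama.from_pretrained so this runs without
# fetching any model files; it is not part of app.py.
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

model_configs = [
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
    {"repo_id": "example/other-model-GGUF", "filename": "other-model-q2_k.gguf"},  # hypothetical entry
]

def fake_load(model_config):
    # Simulate a slow model load; the real app calls Llama.from_pretrained(...) here.
    time.sleep(random.uniform(0.1, 0.5))
    return f"<model {model_config['repo_id']}>"

def load_all_models():
    loaded = []
    with ThreadPoolExecutor(max_workers=len(model_configs)) as executor:
        # Map each future back to its config so log messages name the model
        # that actually finished, regardless of completion order.
        future_to_config = {executor.submit(fake_load, cfg): cfg for cfg in model_configs}
        for future in tqdm(as_completed(future_to_config), total=len(model_configs),
                           desc="Loading models", unit="model"):
            cfg = future_to_config[future]
            try:
                loaded.append(future.result())
                print(f"Loaded: {cfg['repo_id']}")
            except Exception as exc:
                print(f"Failed to load {cfg['repo_id']}: {exc}")
    return loaded

if __name__ == "__main__":
    print(load_all_models())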
@@ -40,7 +47,6 @@ class ChatRequest(BaseModel):
     top_p: float = 0.95
     temperature: float = 0.7

-# Function to generate chat responses
 def generate_chat_response(request, llm):
     try:
         user_input = normalize_input(request.message)
@@ -72,13 +78,10 @@ def filter_duplicates(responses):
     return unique_responses

 def select_best_response(responses):
-
+    print("Filtering responses...")
     unique_responses = filter_duplicates(responses)
-    # Deduplicate responses
     unique_responses = list(set(unique_responses))
-    # Filter coherent responses
     coherent_responses = filter_by_coherence(unique_responses)
-    # Select the best response
     best_response = filter_by_similarity(coherent_responses)
     return best_response

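select_best_response chains three helpers (filter_duplicates, filter_by_coherence, filter_by_similarity) whose bodies sit outside this diff. As a rough illustration of that dedupe, coherence-filter, then pick-one pipeline, here is a sketch with stand-in implementations: the coherence check (minimum word count) and the similarity pick (average difflib ratio against the other answers) are assumptions, not the app's actual logic.

# Hypothetical stand-ins for the helpers referenced by select_best_response;
# the real filter_by_coherence / filter_by_similarity are not shown in this diff.
from difflib import SequenceMatcher

def filter_duplicates(responses):
    # Keep the first occurrence of each response, preserving order.
    seen, unique = set(), []
    for r in responses:
        if r not in seen:
            seen.add(r)
            unique.append(r)
    return unique

def filter_by_coherence(responses):
    # Toy "coherence" check: drop very short answers.
    return [r for r in responses if len(r.split()) >= 3]

def filter_by_similarity(responses):
    # Pick the response most similar on average to the others (a rough
    # consensus choice); handle the empty and single-answer cases first.
    if not responses:
        return None
    if len(responses) == 1:
        return responses[0]
    def avg_sim(candidate):
        others = [r for r in responses if r is not candidate]
        return sum(SequenceMatcher(None, candidate, r).ratio() for r in others) / len(others)
    return max(responses, key=avg_sim)

def select_best_response(responses):
    unique = filter_duplicates(responses)
    coherent = filter_by_coherence(unique)
    return filter_by_similarity(coherent) or (unique[0] if unique else None)

print(select_best_response([
    "Paris is the capital of France.",
    "The capital of France is Paris.",
    "idk",
    "Paris is the capital of France.",
]))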
@@ -97,6 +100,7 @@ def filter_by_similarity(responses):
     return best_response

 def worker_function(llm, request, progress_bar):
+    print(f"Generating a response with the model...")
     response = generate_chat_response(request, llm)
     progress_bar.update(1)
     return response
@@ -111,9 +115,7 @@ async def generate_chat(request: ChatRequest):
     responses = []
     num_models = len(llms)

-    # Create the progress bar
     with tqdm(total=num_models, desc="Generating responses", unit="model") as progress_bar:
-        # Run the models in parallel
         with ThreadPoolExecutor(max_workers=num_models) as executor:
             futures = [executor.submit(worker_function, llm, request, progress_bar) for llm in llms]
             for future in as_completed(futures):
@@ -123,7 +125,6 @@ async def generate_chat(request: ChatRequest):
                 except Exception as exc:
                     print(f"Error generating a response: {exc}")

-    # Select the best response
     best_response = select_best_response(responses)

     print(f"Best response selected: {best_response}")
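The /chat handler in the last three hunks mirrors the loading code: one ThreadPoolExecutor submits the same request to every loaded model, each worker updates a shared tqdm bar, and the collected responses are handed to select_best_response. A compressed sketch of that fan-out, using a hypothetical echo_model callable in place of the Llama instances and a plain dict in place of ChatRequest:

# Sketch of the fan-out used by the chat endpoint; echo_model, the llms list,
# and the request dict are hypothetical stand-ins, not app.py's objects.
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def echo_model(name, message):
    return f"[{name}] {message}"

llms = [lambda msg, n=i: echo_model(f"model-{n}", msg) for i in range(3)]

def worker_function(llm, request, progress_bar):
    response = llm(request["message"])
    progress_bar.update(1)  # progress bar is shared across worker threads, as in app.py
    return response

def generate_chat(request):
    responses = []
    with tqdm(total=len(llms), desc="Generating responses", unit="model") as progress_bar:
        with ThreadPoolExecutor(max_workers=len(llms)) as executor:
            futures = [executor.submit(worker_function, llm, request, progress_bar) for llm in llms]
            for future in as_completed(futures):
                try:
                    responses.append(future.result())
                except Exception as exc:
                    print(f"Error generating a response: {exc}")
    return responses  # app.py then passes these to select_best_response

print(generate_chat({"message": "hello"}))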