import gradio as gr
import logging
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import whisper
from huggingface_hub import login
import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any
import fitz  # PyMuPDF
import os
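# Note: read_document() below lazily imports python-docx and pandas, and
# pandas.read_excel() needs openpyxl for .xlsx files; see the dependency
# sketch at the end of this file.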
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Read the Hugging Face token from the environment
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
if not HUGGINGFACE_TOKEN:
    logger.error("HUGGINGFACE_TOKEN not found in environment variables")
    raise ValueError("Set HUGGINGFACE_TOKEN in the environment variables")

# Authenticate with Hugging Face
login(token=HUGGINGFACE_TOKEN)
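# login() registers the token with huggingface_hub for this session, so model
# downloads authenticate; the token is also passed explicitly below.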
class NewsGenerator:
    def __init__(self):
        self.device = "cpu"  # Force CPU execution
        self.whisper_model = None
        self.llm_model = None
        self.tokenizer = None
        self._load_models()
    def _load_models(self):
        """Load models with CPU-friendly settings"""
        try:
            # Lightweight DeepSeek model
            model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                use_fast=True,
                token=HUGGINGFACE_TOKEN
            )
            self.llm_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="cpu",
                torch_dtype=torch.float32,  # float32: half precision is poorly supported on CPU
                low_cpu_mem_usage=True,
                token=HUGGINGFACE_TOKEN
            )

            # Whisper configuration (multilingual "tiny" model;
            # "tiny.en" handles English-only audio)
            self.whisper_model = whisper.load_model(
                "tiny",
                device="cpu"
            )
        except Exception as e:
            logger.error(f"Error loading models: {e}")
            raise
    def transcribe_audio(self, audio_path: str) -> str:
        """Audio transcription with error handling"""
        try:
            result = self.whisper_model.transcribe(audio_path)
            return result.get("text", "")
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return ""
    def generate_news(self, prompt: str, max_tokens: int = 512) -> str:
        """News generation with DeepSeek"""
        try:
            # Use the tokenizer's chat template so the prompt carries the
            # exact special tokens the model was trained with
            messages = [{
                "role": "user",
                "content": (
                    "You are a professional journalist. Write a news article "
                    f"based on the following data:\n{prompt}"
                )
            }]
            formatted_prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            inputs = self.tokenizer(
                formatted_prompt,
                return_tensors="pt"
            ).to(self.device)

            outputs = self.llm_model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            # Decode only the newly generated tokens, not the echoed prompt
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            return self.tokenizer.decode(new_tokens, skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Generation error: {e}")
            return "Error generating the article"
def read_document(file_path: str) -> str:
    """Document reading for the supported formats"""
    try:
        if file_path.lower().endswith(".pdf"):
            with fitz.open(file_path) as doc:
                return " ".join(page.get_text() for page in doc)
        elif file_path.lower().endswith(".docx"):
            from docx import Document  # python-docx, imported lazily
            return " ".join(p.text for p in Document(file_path).paragraphs)
        elif file_path.lower().endswith(".xlsx"):
            import pandas as pd
            return pd.read_excel(file_path).to_string()
        elif file_path.lower().endswith(".csv"):
            import pandas as pd
            return pd.read_csv(file_path).to_string()
        return ""
    except Exception as e:
        logger.error(f"Error reading document: {e}")
        return ""
def read_url(url: str) -> str:
    """Web content extraction"""
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser').get_text(separator=' ', strip=True)
    except Exception as e:
        logger.error(f"Error reading URL: {e}")
        return ""
def process_social_media(url: str) -> Dict[str, Any]:
    """Social media processing (plain HTML fetch; no platform API)"""
    try:
        text = read_url(url)
        return {"text": text, "video": None}
    except Exception as e:
        logger.error(f"Error processing social media: {e}")
        return {"text": "", "video": None}
def create_interface():
    """Gradio user interface"""
    generator = NewsGenerator()

    with gr.Blocks(title="AI News Generator", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 📰 AI News Generator

        Turn raw data into professional news articles using advanced AI.

        ### Features:
        - Multi-source processing (text, documents, audio, web)
        - Professional journalistic styles
        - Automatic audio transcription
        - Customizable length and tone

        ---
        Developed by Camilo Vega, AI Consultant
        [LinkedIn profile](https://www.linkedin.com/in/camilo-vega-169084b1/)
        """)
        with gr.Row():
            with gr.Column(scale=3):
                main_input = gr.Textbox(
                    label="Main Topic",
                    placeholder="Enter the main topic or instructions...",
                    lines=3
                )
                additional_data = gr.Textbox(
                    label="Additional Data",
                    placeholder="Key facts, names, dates...",
                    lines=3
                )

                with gr.Accordion("Additional Sources", open=False):
                    doc_upload = gr.File(
                        label="Upload Document",
                        file_types=[".pdf", ".docx", ".xlsx", ".csv"]
                    )
                    audio_upload = gr.File(
                        label="Upload Audio/Video",
                        file_types=["audio", "video"]
                    )
                    url_input = gr.Textbox(
                        label="Reference URL",
                        placeholder="https://..."
                    )
                    social_input = gr.Textbox(
                        label="Social Media URL",
                        placeholder="https://..."
                    )

                length_slider = gr.Slider(
                    100, 1000, value=400,
                    label="Article Length (words)"
                )
                tone_select = gr.Dropdown(
                    label="Journalistic Tone",
                    choices=["Formal", "Neutral", "Investigative", "Narrative"],
                    value="Neutral"
                )

            with gr.Column(scale=2):
                output_news = gr.Textbox(
                    label="Generated Article",
                    lines=18,
                    interactive=False
                )
                generate_btn = gr.Button("Generate Article", variant="primary")
                status = gr.Textbox(label="Status", interactive=False)
        def process_and_generate(
            main_input: str,
            additional_data: str,
            document: Optional[str],
            audio: Optional[str],
            url: Optional[str],
            social_url: Optional[str],
            length: int,
            tone: str
        ):
            try:
                # Gather the optional sources
                doc_content = read_document(document) if document else ""
                audio_content = generator.transcribe_audio(audio) if audio else ""
                url_content = read_url(url) if url else ""
                social_content = process_social_media(social_url) if social_url else {"text": ""}

                # Build a structured prompt
                prompt = f"""
                ## Instructions:
                - Main Topic: {main_input}
                - Provided Data: {additional_data}
                - Required Tone: {tone}

                ## Sources:
                - Document: {doc_content[:500]}...
                - Audio: {audio_content[:300]}...
                - URL: {url_content[:500]}...
                - Social Media: {social_content['text'][:300]}...

                ## Requirements:
                - Professional structure (headline, lead, body)
                - Cover the five W's
                - Relevant quotes where applicable
                - Length: {length} words
                """

                # Rough words-to-tokens budget (~1.5 tokens per word is an estimate)
                article = generator.generate_news(prompt, int(length * 1.5))
                return article, "✅ Generation successful"
            except Exception as e:
                logger.error(str(e))
                return f"Error: {e}", "❌ Generation error"
        generate_btn.click(
            fn=process_and_generate,
            inputs=[
                main_input,
                additional_data,
                doc_upload,
                audio_upload,
                url_input,
                social_input,
                length_slider,
                tone_select
            ],
            outputs=[output_news, status]
        )

    return app
if __name__ == "__main__":
    app = create_interface()
    app.launch()  # share=True is unnecessary (and ignored) on Hugging Face Spaces
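# A requirements.txt sketch derived from the imports in this file (unpinned;
# exact version pins are left to the Space's configuration):
#
#   gradio
#   torch
#   transformers
#   openai-whisper
#   huggingface_hub
#   requests
#   beautifulsoup4
#   PyMuPDF
#   python-docx
#   pandas
#   openpyxl
#
# Whisper also needs the ffmpeg binary (e.g. via packages.txt on Spaces).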