import gradio as gr
import logging
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import whisper
from huggingface_hub import login
import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any
import fitz  # PyMuPDF
import os
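# Note: read_document() below lazily imports python-docx and pandas, and
# pandas.read_excel() needs openpyxl for .xlsx files; see the dependency
# sketch at the end of this file.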
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Read the Hugging Face token from the environment
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
if not HUGGINGFACE_TOKEN:
    logger.error("HUGGINGFACE_TOKEN not found in environment variables")
    raise ValueError("Set HUGGINGFACE_TOKEN in the environment variables")

# Authenticate with Hugging Face
login(token=HUGGINGFACE_TOKEN)
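# login() registers the token with huggingface_hub for this session, so model
# downloads authenticate; the token is also passed explicitly below.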
class NewsGenerator:
    def __init__(self):
        self.device = "cpu"  # Force CPU execution
        self.whisper_model = None
        self.llm_model = None
        self.tokenizer = None
        self._load_models()
    def _load_models(self):
        """Load models with CPU-friendly settings"""
        try:
            # Lightweight DeepSeek model
            model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                use_fast=True,
                token=HUGGINGFACE_TOKEN
            )
            self.llm_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="cpu",
                torch_dtype=torch.float32,  # float32: half precision is poorly supported on CPU
                low_cpu_mem_usage=True,
                token=HUGGINGFACE_TOKEN
            )

            # Whisper configuration (multilingual "tiny" model;
            # "tiny.en" handles English-only audio)
            self.whisper_model = whisper.load_model(
                "tiny",
                device="cpu"
            )
        except Exception as e:
            logger.error(f"Error loading models: {e}")
            raise
    def transcribe_audio(self, audio_path: str) -> str:
        """Audio transcription with error handling"""
        try:
            result = self.whisper_model.transcribe(audio_path)
            return result.get("text", "")
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return ""
    def generate_news(self, prompt: str, max_tokens: int = 512) -> str:
        """News generation with DeepSeek"""
        try:
            # Use the tokenizer's chat template so the prompt carries the
            # exact special tokens the model was trained with
            messages = [{
                "role": "user",
                "content": (
                    "You are a professional journalist. Write a news article "
                    f"based on the following data:\n{prompt}"
                )
            }]
            formatted_prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            inputs = self.tokenizer(
                formatted_prompt,
                return_tensors="pt"
            ).to(self.device)

            outputs = self.llm_model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            # Decode only the newly generated tokens, not the echoed prompt
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            return self.tokenizer.decode(new_tokens, skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Generation error: {e}")
            return "Error generating the article"
def read_document(file_path: str) -> str:
    """Document reading for the supported formats"""
    try:
        if file_path.lower().endswith(".pdf"):
            with fitz.open(file_path) as doc:
                return " ".join(page.get_text() for page in doc)
        elif file_path.lower().endswith(".docx"):
            from docx import Document  # python-docx, imported lazily
            return " ".join(p.text for p in Document(file_path).paragraphs)
        elif file_path.lower().endswith(".xlsx"):
            import pandas as pd
            return pd.read_excel(file_path).to_string()
        elif file_path.lower().endswith(".csv"):
            import pandas as pd
            return pd.read_csv(file_path).to_string()
        return ""
    except Exception as e:
        logger.error(f"Error reading document: {e}")
        return ""
def read_url(url: str) -> str:
    """Web content extraction"""
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser').get_text(separator=' ', strip=True)
    except Exception as e:
        logger.error(f"Error reading URL: {e}")
        return ""
def process_social_media(url: str) -> Dict[str, Any]:
    """Social media processing (plain HTML fetch; no platform API)"""
    try:
        text = read_url(url)
        return {"text": text, "video": None}
    except Exception as e:
        logger.error(f"Error processing social media: {e}")
        return {"text": "", "video": None}
def create_interface():
    """Gradio user interface"""
    generator = NewsGenerator()

    with gr.Blocks(title="AI News Generator", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 📰 AI News Generator

        Turn raw data into professional news articles using advanced AI.

        ### Features:
        - Multi-source processing (text, documents, audio, web)
        - Professional journalistic styles
        - Automatic audio transcription
        - Customizable length and tone

        ---
        Developed by Camilo Vega, AI Consultant
        [LinkedIn profile](https://www.linkedin.com/in/camilo-vega-169084b1/)
        """)
        with gr.Row():
            with gr.Column(scale=3):
                main_input = gr.Textbox(
                    label="Main Topic",
                    placeholder="Enter the main topic or instructions...",
                    lines=3
                )
                additional_data = gr.Textbox(
                    label="Additional Data",
                    placeholder="Key facts, names, dates...",
                    lines=3
                )

                with gr.Accordion("Additional Sources", open=False):
                    doc_upload = gr.File(
                        label="Upload Document",
                        file_types=[".pdf", ".docx", ".xlsx", ".csv"]
                    )
                    audio_upload = gr.File(
                        label="Upload Audio/Video",
                        file_types=["audio", "video"]
                    )
                    url_input = gr.Textbox(
                        label="Reference URL",
                        placeholder="https://..."
                    )
                    social_input = gr.Textbox(
                        label="Social Media URL",
                        placeholder="https://..."
                    )

                length_slider = gr.Slider(
                    100, 1000, value=400,
                    label="Article Length (words)"
                )
                tone_select = gr.Dropdown(
                    label="Journalistic Tone",
                    choices=["Formal", "Neutral", "Investigative", "Narrative"],
                    value="Neutral"
                )

            with gr.Column(scale=2):
                output_news = gr.Textbox(
                    label="Generated Article",
                    lines=18,
                    interactive=False
                )
                generate_btn = gr.Button("Generate Article", variant="primary")
                status = gr.Textbox(label="Status", interactive=False)
        def process_and_generate(
            main_input: str,
            additional_data: str,
            document: Optional[str],
            audio: Optional[str],
            url: Optional[str],
            social_url: Optional[str],
            length: int,
            tone: str
        ):
            try:
                # Gather the optional sources
                doc_content = read_document(document) if document else ""
                audio_content = generator.transcribe_audio(audio) if audio else ""
                url_content = read_url(url) if url else ""
                social_content = process_social_media(social_url) if social_url else {"text": ""}

                # Build a structured prompt
                prompt = f"""
                ## Instructions:
                - Main Topic: {main_input}
                - Provided Data: {additional_data}
                - Required Tone: {tone}

                ## Sources:
                - Document: {doc_content[:500]}...
                - Audio: {audio_content[:300]}...
                - URL: {url_content[:500]}...
                - Social Media: {social_content['text'][:300]}...

                ## Requirements:
                - Professional structure (headline, lead, body)
                - Cover the five W's
                - Relevant quotes where applicable
                - Length: {length} words
                """

                # Rough words-to-tokens budget (~1.5 tokens per word is an estimate)
                article = generator.generate_news(prompt, int(length * 1.5))
                return article, "✅ Generation successful"
            except Exception as e:
                logger.error(str(e))
                return f"Error: {e}", "❌ Generation error"
        generate_btn.click(
            fn=process_and_generate,
            inputs=[
                main_input,
                additional_data,
                doc_upload,
                audio_upload,
                url_input,
                social_input,
                length_slider,
                tone_select
            ],
            outputs=[output_news, status]
        )

    return app
if __name__ == "__main__":
    app = create_interface()
    app.launch()  # share=True is unnecessary (and ignored) on Hugging Face Spaces
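# A requirements.txt sketch derived from the imports in this file (unpinned;
# exact version pins are left to the Space's configuration):
#
#   gradio
#   torch
#   transformers
#   openai-whisper
#   huggingface_hub
#   requests
#   beautifulsoup4
#   PyMuPDF
#   python-docx
#   pandas
#   openpyxl
#
# Whisper also needs the ffmpeg binary (e.g. via packages.txt on Spaces).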