Spaces:

CamiloVega
/

News_AI

Runtime error

App Files Files Community

CamiloVega committed 4 days ago

Commit

4cfecb3

verified ·

1 Parent(s): d01ead7

Update app.py

Browse files

Files changed (1) hide show

app.py +170 -61

app.py CHANGED Viewed

@@ -1,141 +1,250 @@
 import gradio as gr
 import logging
 import torch
-from transformers import pipeline, AutoTokenizer
 import whisper
 from pydub import AudioSegment
 import requests
 from bs4 import BeautifulSoup
-from typing import Optional
-# Configuración básica de logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 class NewsGenerator:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.whisper_model = None
-        self.news_pipeline = None
         self.tokenizer = None
-        # Carga diferida de modelos
         self._load_models()
     def _load_models(self):
-        """Carga eficiente de modelos con gestión de memoria"""
         try:
-            # Modelo de texto más pequeño y eficiente
-            model_name = "facebook/bart-large-cnn"
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-            self.news_pipeline = pipeline(
-                "summarization",
-                model=model_name,
-                device=self.device,
-                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
             )
-            # Whisper optimizado
             self.whisper_model = whisper.load_model(
-                "tiny.en" if self.device == "cpu" else "small",
                 device=self.device
             )
         except Exception as e:
-            logger.error(f"Error loading models: {str(e)}")
             raise
     def transcribe_audio(self, audio_path: str) -> str:
-        """Transcripción optimizada de audio"""
         try:
             result = self.whisper_model.transcribe(audio_path)
             return result.get("text", "")
         except Exception as e:
-            logger.error(f"Transcription error: {str(e)}")
             return ""
-    def generate_news(self, inputs: str, max_length: int = 200) -> str:
-        """Generación de noticias con control de recursos"""
         try:
-            return self.news_pipeline(
-                inputs,
-                max_length=max_length,
-                min_length=30,
-                do_sample=False,  # Mejor rendimiento
-                truncation=True
-            )[0]['summary_text']
         except Exception as e:
-            logger.error(f"Generation error: {str(e)}")
-            return "Error generating content"
 def read_document(file_path: str) -> str:
     """Lectura optimizada de documentos"""
     try:
         if file_path.endswith(".pdf"):
-            import fitz
             with fitz.open(file_path) as doc:
                 return " ".join(page.get_text() for page in doc)
         elif file_path.endswith(".docx"):
             from docx import Document
             return " ".join(p.text for p in Document(file_path).paragraphs)
-        elif file_path.endswith((".xlsx", ".csv")):
             import pandas as pd
-            return pd.read_excel(file_path).to_string() if file_path.endswith(".xlsx") else pd.read_csv(file_path).to_string()
         return ""
     except Exception as e:
-        logger.error(f"Document error: {str(e)}")
         return ""
 def create_interface():
-    """Interfaz optimizada con Gradio"""
     generator = NewsGenerator()
-    with gr.Blocks(title="Generador de Noticias Eficiente") as app:
-        gr.Markdown("## 📰 Generador de Noticias Optimizado")
         with gr.Row():
             with gr.Column(scale=2):
-                inputs = gr.Textbox(label="Entrada Principal", lines=5)
-                max_length = gr.Slider(100, 500, value=200, label="Longitud Máxima")
                 generate_btn = gr.Button("Generar Noticia", variant="primary")
-            with gr.Column(scale=1):
-                doc_upload = gr.File(label="Subir Documento", file_types=[".pdf", ".docx", ".xlsx", ".csv"])
-                audio_upload = gr.File(label="Subir Audio", file_types=["audio", "video"])
-        output = gr.Textbox(label="Noticia Generada", lines=10, interactive=False)
-        def process_inputs(
             main_input: str,
             document: Optional[str],
             audio: Optional[str],
-            max_len: int
         ):
             try:
-                # Procesar documentos y audio
                 doc_content = read_document(document) if document else ""
                 audio_content = generator.transcribe_audio(audio) if audio else ""
-                # Construir prompt
-                full_input = "\n".join([
-                    main_input,
-                    f"Documento: {doc_content}",
-                    f"Audio: {audio_content}"
-                ])
-                return generator.generate_news(full_input, max_len)
             except Exception as e:
-                logger.error(f"Processing error: {str(e)}")
-                return f"Error: {str(e)}"
         generate_btn.click(
-            fn=process_inputs,
-            inputs=[inputs, doc_upload, audio_upload, max_length],
-            outputs=output
         )
     return app

 import logging
 import os
 from typing import Optional, Dict, Any

 import fitz  # PyMuPDF
 import gradio as gr
 import numpy as np
 import requests
 import torch
 import whisper
 from bs4 import BeautifulSoup
 from huggingface_hub import login
 from pydub import AudioSegment
 from transformers import AutoTokenizer, AutoModelForCausalLM
+# Configuración de logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
+# Autenticación Hugging Face (reemplaza con tu token)
+HF_TOKEN = "hf_tu_token_aqui"
+login(token=HF_TOKEN)
 class NewsGenerator:
     """Generate news articles with a chat LLM and transcribe audio with Whisper.

     Both models are loaded eagerly by the constructor, so creating an
     instance re-raises any model-loading failure.
     """

     def __init__(self):
         # Device used for Whisper; the LLM is placed by ``device_map="auto"``.
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.whisper_model = None
         self.llm_model = None
         self.tokenizer = None
         self._load_models()

     def _load_models(self):
         """Load the tokenizer, the 4-bit quantized LLM and the Whisper model.

         Raises:
             Exception: any error raised by the underlying loaders; it is
                 logged and re-raised unchanged.
         """
         try:
             # Llama-2 7B Chat
             model_name = "meta-llama/Llama-2-7b-chat-hf"
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 use_fast=True,
                 token=HF_TOKEN
             )
             # NOTE(review): 4-bit loading presumably needs bitsandbytes and a
             # CUDA device - confirm for the deployment target.
             self.llm_model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 device_map="auto",
                 torch_dtype=torch.float16,
                 load_in_4bit=True,
                 low_cpu_mem_usage=True,
                 token=HF_TOKEN
             )
             # Whisper: the English-only "small.en" checkpoint on CPU,
             # "medium" otherwise.
             self.whisper_model = whisper.load_model(
                 "small.en" if self.device == "cpu" else "medium",
                 device=self.device
             )
         except Exception as e:
             logger.error("Error cargando modelos: %s", e)
             raise

     def transcribe_audio(self, audio_path: str) -> str:
         """Transcribe an audio file.

         Args:
             audio_path: Path handed to the Whisper model.

         Returns:
             The ``"text"`` entry of the transcription result, or ``""`` when
             the entry is missing or transcription fails (failure is logged).
         """
         try:
             result = self.whisper_model.transcribe(audio_path)
             return result.get("text", "")
         except Exception as e:
             logger.error("Error en transcripción: %s", e)
             return ""

     def generate_news(self, prompt: str, max_length: int = 512) -> str:
         """Generate a news article from ``prompt``.

         Args:
             prompt: Facts and instructions inserted into the chat template.
             max_length: Maximum number of newly generated tokens. Non-integer
                 numbers (for example a slider value) are truncated to int.

         Returns:
             Only the generated continuation, without the prompt, or the
             string ``"Error en generación"`` when generation fails (the
             failure is logged).
         """
         try:
             encoded = self.tokenizer(
                 f"[INST]<<SYS>>Eres un periodista profesional. Genera una noticia bien estructurada basada en los siguientes datos:<</SYS>>\n{prompt}[/INST]",
                 return_tensors="pt"
             )
             # The model is placed by device_map="auto"; follow the model's own
             # device and fall back to the instance device if it has none.
             target_device = getattr(self.llm_model, "device", self.device)
             encoded = encoded.to(target_device)
             prompt_len = len(encoded["input_ids"][0])
             outputs = self.llm_model.generate(
                 **encoded,
                 max_new_tokens=int(max_length),
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True,
                 pad_token_id=self.tokenizer.eos_token_id
             )
             # The returned sequence starts with the prompt tokens; drop them
             # so the instructions are not echoed back as part of the article.
             completion_ids = outputs[0][prompt_len:]
             text = self.tokenizer.decode(completion_ids, skip_special_tokens=True)
             return text.strip()
         except Exception as e:
             logger.error("Error generando noticia: %s", e)
             return "Error en generación"
 def read_document(file_path: str) -> str:
     """Extract plain text from a PDF, DOCX, XLSX or CSV document.

     Args:
         file_path: Path to the document. An object exposing the path as a
             ``.name`` attribute is accepted as well.

     Returns:
         The extracted text, or ``""`` when the input is empty, the extension
         is not one of the supported four, or reading fails (the failure is
         logged).
     """
     if not file_path:
         return ""
     # NOTE(review): depending on the Gradio version the upload may arrive as
     # a temp-file wrapper rather than a string - confirm; both are handled.
     path = str(getattr(file_path, "name", file_path))
     # Match the extension case-insensitively so "REPORT.PDF" is supported.
     lowered = path.lower()
     try:
         if lowered.endswith(".pdf"):
             with fitz.open(path) as doc:
                 return " ".join(page.get_text() for page in doc)
         if lowered.endswith(".docx"):
             from docx import Document
             return " ".join(p.text for p in Document(path).paragraphs)
         if lowered.endswith(".xlsx"):
             import pandas as pd
             return pd.read_excel(path).to_string()
         if lowered.endswith(".csv"):
             import pandas as pd
             return pd.read_csv(path).to_string()
         return ""
     except Exception as e:
         logger.error("Error leyendo documento: %s", e)
         return ""
+def read_url(url: str) -> str:
+    """Extracción de contenido web"""
+    try:
+        response = requests.get(url, timeout=15)
+        response.raise_for_status()
+        return BeautifulSoup(response.content, 'html.parser').get_text(separator=' ', strip=True)
     except Exception as e:
+        logger.error(f"Error leyendo URL: {str(e)}")
         return ""
+def process_social_media(url: str) -> Dict[str, Any]:
+    """Procesamiento de contenido social"""
+    try:
+        text = read_url(url)
+        return {"text": text, "video": None}
+    except Exception as e:
+        logger.error(f"Error procesando red social: {str(e)}")
+        return {"text": "", "video": None}
 def _resolve_upload_path(upload: Any) -> Optional[str]:
     """Return the filesystem path of an upload, or None when there is none."""
     if not upload:
         return None
     # NOTE(review): the upload may be a plain path or an object exposing the
     # path as ``.name`` depending on the Gradio version - confirm.
     return str(getattr(upload, "name", upload))


 def _build_prompt(main_input, additional_data, tone, length, sources) -> str:
     """Assemble the prompt text; ``sources`` holds (label, text, limit) tuples."""
     lines = [
         "## Instrucciones:",
         f"- Tema principal: {main_input}",
         f"- Datos proporcionados: {additional_data}",
         f"- Tono requerido: {tone}",
     ]
     source_lines = []
     for label, text, limit in sources:
         text = (text or "").strip()
         if not text:
             # Empty sources are left out instead of sending "- Label: ...".
             continue
         # Mark truncation only when something was actually cut off.
         suffix = "..." if len(text) > limit else ""
         source_lines.append(f"- {label}: {text[:limit]}{suffix}")
     if source_lines:
         lines.append("## Fuentes:")
         lines.extend(source_lines)
     lines += [
         "## Requisitos:",
         "- Estructura profesional (titular, lead, cuerpo)",
         "- Incluir las 5W",
         "- Citas relevantes si aplica",
         f"- Longitud: {length} palabras",
     ]
     # Joined without indentation so no stray leading whitespace reaches the
     # model.
     return "\n".join(lines)


 def create_interface():
     """Build the Gradio user interface.

     Instantiates ``NewsGenerator`` (which loads its models) and wires the
     input widgets to the generation callback.

     Returns:
         The ``gr.Blocks`` application; the caller is responsible for
         launching it.
     """
     generator = NewsGenerator()
     with gr.Blocks(title="Generador de Noticias AI", theme=gr.themes.Soft()) as app:
         gr.Markdown("# 📰 Generador de Noticias Profesional")
         with gr.Row():
             with gr.Column(scale=3):
                 main_input = gr.Textbox(
                     label="Tema principal",
                     placeholder="Ingrese el tema o instrucciones principales...",
                     lines=3
                 )
                 additional_data = gr.Textbox(
                     label="Datos adicionales",
                     placeholder="Hechos clave, nombres, fechas, etc...",
                     lines=3
                 )
                 with gr.Accordion("Fuentes adicionales", open=False):
                     doc_upload = gr.File(
                         label="Subir documento",
                         file_types=[".pdf", ".docx", ".xlsx", ".csv"]
                     )
                     audio_upload = gr.File(
                         label="Subir audio/video",
                         file_types=["audio", "video"]
                     )
                     url_input = gr.Textbox(
                         label="URL de referencia",
                         placeholder="https://..."
                     )
                     social_input = gr.Textbox(
                         label="URL de red social",
                         placeholder="https://..."
                     )
                 length_slider = gr.Slider(
                     100, 1000, value=400,
                     label="Longitud de la noticia (palabras)"
                 )
                 tone_select = gr.Dropdown(
                     label="Tono periodístico",
                     choices=["Formal", "Neutral", "Investigativo", "Narrativo"],
                     value="Neutral"
                 )
             with gr.Column(scale=2):
                 output_news = gr.Textbox(
                     label="Noticia generada",
                     lines=18,
                     interactive=False
                 )
                 generate_btn = gr.Button("Generar Noticia", variant="primary")
                 status = gr.Textbox(label="Estado", interactive=False)

         def process_and_generate(
             main_input: str,
             additional_data: str,
             document: Optional[Any],
             audio: Optional[Any],
             url: Optional[str],
             social_url: Optional[str],
             length: int,
             tone: str
         ):
             """Collect every source, build the prompt and generate the article.

             Returns a ``(article, status)`` pair matching the two outputs.
             """
             try:
                 # Gather the optional sources.
                 doc_path = _resolve_upload_path(document)
                 audio_path = _resolve_upload_path(audio)
                 doc_content = read_document(doc_path) if doc_path else ""
                 audio_content = generator.transcribe_audio(audio_path) if audio_path else ""
                 url_content = read_url(url) if url else ""
                 social_content = process_social_media(social_url) if social_url else {"text": ""}
                 # The slider may deliver a float; both the prompt and the
                 # generator expect a whole number.
                 # NOTE(review): the slider is labelled in words but the value
                 # is passed on as the generator's length limit - confirm the
                 # intended unit.
                 target_length = int(length)
                 prompt = _build_prompt(
                     main_input,
                     additional_data,
                     tone,
                     target_length,
                     [
                         ("Documento", doc_content, 1000),
                         ("Audio", audio_content, 500),
                         ("URL", url_content, 1000),
                         ("Red social", social_content['text'], 500),
                     ],
                 )
                 return generator.generate_news(prompt, target_length), "✅ Generación exitosa"
             except Exception as e:
                 logger.error(str(e))
                 return f"Error: {str(e)}", "❌ Error en generación"

         generate_btn.click(
             fn=process_and_generate,
             inputs=[
                 main_input,
                 additional_data,
                 doc_upload,
                 audio_upload,
                 url_input,
                 social_input,
                 length_slider,
                 tone_select
             ],
             outputs=[output_news, status]
         )
     return app