CamiloVega committed on
Commit
393aa1f
verified
1 Parent(s): fadd658

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -192
app.py CHANGED
@@ -1,235 +1,150 @@
1
  import gradio as gr
2
  import logging
3
- import os
4
  import torch
5
- from transformers import AutoTokenizer, AutoModelForCausalLM
6
  import whisper
7
  from pydub import AudioSegment
8
  import requests
9
  from bs4 import BeautifulSoup
10
- from typing import Optional, Dict, Any
11
- from dataclasses import dataclass
12
 
 
13
  logging.basicConfig(
14
  level=logging.INFO,
15
- format='%(asctime)s - %(levelname)s) %(message)s'
16
  )
17
  logger = logging.getLogger(__name__)
18
 
19
- @dataclass
20
- class NewsConfig:
21
- model_name: str = "meta-llama/Llama-2-3b-chat-hf"
22
- max_tokens: int = 256
23
- temperature: float = 0.7
24
- top_p: float = 0.95
25
-
26
  class NewsGenerator:
27
  def __init__(self):
28
- self.config = NewsConfig()
29
- self.tokenizer = None
30
- self.model = None
31
  self.whisper_model = None
32
- self._initialize_models()
 
 
 
 
33
 
34
- def _initialize_models(self):
35
- """Initialize models with efficient settings"""
36
  try:
37
- if not self.tokenizer:
38
- self.tokenizer = AutoTokenizer.from_pretrained(
39
- self.config.model_name,
40
- use_fast=True,
41
- model_max_length=self.config.max_tokens
42
- )
43
- self.tokenizer.pad_token = self.tokenizer.eos_token
 
 
44
 
45
- if not self.model:
46
- self.model = AutoModelForCausalLM.from_pretrained(
47
- self.config.model_name,
48
- device_map="auto",
49
- torch_dtype=torch.float16,
50
- low_cpu_mem_usage=True,
51
- use_safetensors=True
52
- )
53
-
54
- if not self.whisper_model:
55
- self.whisper_model = whisper.load_model(
56
- "tiny",
57
- device="cuda" if torch.cuda.is_available() else "cpu"
58
- )
59
 
60
  except Exception as e:
61
- logger.error(f"Error initializing models: {str(e)}")
62
  raise
63
 
64
- def transcribe_audio(self, audio_file: str) -> str:
65
- """Transcribe audio file with improved error handling"""
66
  try:
67
- if not audio_file:
68
- return "Error: No audio file provided"
69
-
70
- result = self.whisper_model.transcribe(audio_file)
71
- return result.get("text", "Transcription failed")
72
-
73
  except Exception as e:
74
- logger.error(f"Audio transcription error: {str(e)}")
75
- return f"Error transcribing audio: {str(e)}"
76
 
77
- def generate_news(self, prompt: str) -> str:
78
- """Generate news article with optimized parameters"""
79
  try:
80
- with torch.inference_mode():
81
- outputs = self.model.generate(
82
- inputs=self.tokenizer(prompt, return_tensors="pt").input_ids,
83
- max_new_tokens=self.config.max_tokens,
84
- temperature=self.config.temperature,
85
- top_p=self.config.top_p,
86
- do_sample=True,
87
- early_stopping=True
88
- )
89
- return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
90
  except Exception as e:
91
- logger.error(f"News generation error: {str(e)}")
92
- return f"Error generating news: {str(e)}"
93
 
94
- def read_document(document_path: str) -> str:
95
- """Read document content with better error handling"""
96
  try:
97
- if document_path.endswith(".pdf"):
98
- with fitz.open(document_path) as doc:
99
- return "\n".join(page.get_text() for page in doc)
100
- elif document_path.endswith((".docx", ".xlsx", ".csv")):
101
- content = ""
102
- if document_path.endswith(".docx"):
103
- import docx
104
- doc = docx.Document(document_path)
105
- content = "\n".join(p.text for p in doc.paragraphs)
106
- elif document_path.endswith(".xlsx"):
107
- import pandas as pd
108
- content = pd.read_excel(document_path).to_string()
109
- elif document_path.endswith(".csv"):
110
- import pandas as pd
111
- content = pd.read_csv(document_path).to_string()
112
- return content
113
- return "Unsupported file type"
114
  except Exception as e:
115
- logger.error(f"Document reading error: {str(e)}")
116
- return f"Error reading document: {str(e)}"
117
 
118
- def read_url(url: str) -> str:
119
- """Read URL content with better handling"""
120
- try:
121
- response = requests.get(url, timeout=10)
122
- response.raise_for_status()
123
- return BeautifulSoup(response.content, 'html.parser').get_text()
124
- except Exception as e:
125
- logger.error(f"URL reading error: {str(e)}")
126
- return f"Error reading URL: {str(e)}"
127
-
128
- def process_social_media(url: str) -> Dict[str, Any]:
129
- """Process social media content with improved handling"""
130
- try:
131
- text = read_url(url)
132
- return {"text": text, "video": None}
133
- except Exception as e:
134
- logger.error(f"Social media processing error: {str(e)}")
135
- return {"text": None, "video": None}
136
-
137
- def main():
138
- """Main function to create and run the Gradio app"""
139
- news_generator = NewsGenerator()
140
 
141
- with gr.Blocks() as demo:
142
- gr.Markdown("# Generador de Noticias Optimizado")
143
-
144
- with gr.Row():
145
- instrucciones = gr.Textbox(label="Instrucciones", lines=2)
146
- hechos = gr.Textbox(label="Hechos", lines=4)
147
- tama帽o = gr.Number(label="Tama帽o (palabras)", value=100)
148
- tono = gr.Dropdown(label="Tono", choices=["serio", "neutral", "divertido"], value="neutral")
149
 
150
  with gr.Row():
151
- documento = gr.File(label="Documento", file_types=["pdf", "docx", "xlsx", "csv"])
152
- audio = gr.File(label="Audio/Video", file_types=["audio", "video"])
153
- url = gr.Textbox(label="URL")
154
- social_url = gr.Textbox(label="URL de red social")
155
-
156
- with gr.Row():
157
- generar = gr.Button("Generar Noticia")
158
- noticia = gr.Textbox(label="Noticia Generada", lines=20)
159
- transcripciones = gr.Textbox(label="Transcripciones", lines=10)
 
160
 
161
- def generate_news_output(
162
- instrucciones: str,
163
- hechos: str,
164
- tama帽o: int,
165
- tono: str,
166
- documento: Optional[gr.File],
167
- audio: Optional[gr.File],
168
- url: Optional[str],
169
- social_url: Optional[str]
170
  ):
171
  try:
172
- # Process document
173
- if documento:
174
- doc_content = read_document(documento.name)
175
- else:
176
- doc_content = ""
177
-
178
- # Process audio
179
- if audio:
180
- audio_content = news_generator.transcribe_audio(audio.name)
181
- else:
182
- audio_content = ""
183
-
184
- # Process URL
185
- if url:
186
- url_content = read_url(url)
187
- else:
188
- url_content = ""
189
-
190
- # Process social media
191
- if social_url:
192
- social_content = process_social_media(social_url)
193
- else:
194
- social_content = {"text": "", "video": ""}
195
 
196
- # Generate prompt
197
- prompt = f"""[INST] Escribe una noticia basada en la siguiente información:
198
- Instrucciones: {instrucciones}
199
- Hechos: {hechos}
200
- Documento: {doc_content}
201
- Audio: {audio_content}
202
- URL: {url_content}
203
- Red Social: {social_content['text']}
204
- Video: {social_content['video'] if social_content else ''}
205
-
206
- Parámetros:
207
- - Tama帽o: {tama帽o} palabras
208
- - Tono: {tono}
209
- - Incluye: Título, gancho, cuerpo, 5W
210
- - Estilo periodístico
211
- [/INST]"""
212
-
213
- # Generate news
214
- news = news_generator.generate_news(prompt)
215
- return news, f"Transcripciones generadas correctamente"
216
 
 
 
217
  except Exception as e:
218
- return f"Error generando noticia: {str(e)}", f"Error: {str(e)}"
219
-
220
- generate_news_output(
221
- instrucciones,
222
- hechos,
223
- tama帽o,
224
- tono,
225
- documento,
226
- audio,
227
- url,
228
- social_url
229
- )(generar, [noticia, transcripciones])
230
-
231
- if __name__ == "__main__":
232
- demo.launch()
233
 
234
  if __name__ == "__main__":
235
- main()
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import logging
 
3
  import torch
4
+ from transformers import pipeline, AutoTokenizer
5
  import whisper
6
  from pydub import AudioSegment
7
  import requests
8
  from bs4 import BeautifulSoup
9
+ from typing import Optional
 
10
 
11
+ # Configuración básica de logging
12
  logging.basicConfig(
13
  level=logging.INFO,
14
+ format='%(asctime)s - %(levelname)s - %(message)s'
15
  )
16
  logger = logging.getLogger(__name__)
17
 
 
 
 
 
 
 
 
18
  class NewsGenerator:
19
  def __init__(self):
20
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
21
  self.whisper_model = None
22
+ self.news_pipeline = None
23
+ self.tokenizer = None
24
+
25
+ # Carga diferida de modelos
26
+ self._load_models()
27
 
28
+ def _load_models(self):
29
+ """Carga eficiente de modelos con gestión de memoria"""
30
  try:
31
+ # Modelo de texto más pequeño y eficiente
32
+ model_name = "facebook/bart-large-cnn"
33
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
34
+ self.news_pipeline = pipeline(
35
+ "summarization",
36
+ model=model_name,
37
+ device=self.device,
38
+ torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
39
+ )
40
 
41
+ # Whisper optimizado
42
+ self.whisper_model = whisper.load_model(
43
+ "tiny.en" if self.device == "cpu" else "small",
44
+ device=self.device
45
+ )
 
 
 
 
 
 
 
 
 
46
 
47
  except Exception as e:
48
+ logger.error(f"Error loading models: {str(e)}")
49
  raise
50
 
51
+ def transcribe_audio(self, audio_path: str) -> str:
52
+ """Transcripción optimizada de audio"""
53
  try:
54
+ result = self.whisper_model.transcribe(audio_path)
55
+ return result.get("text", "")
 
 
 
 
56
  except Exception as e:
57
+ logger.error(f"Transcription error: {str(e)}")
58
+ return ""
59
 
60
+ def generate_news(self, inputs: str, max_length: int = 200) -> str:
61
+ """Generación de noticias con control de recursos"""
62
  try:
63
+ return self.news_pipeline(
64
+ inputs,
65
+ max_length=max_length,
66
+ min_length=30,
67
+ do_sample=False, # Mejor rendimiento
68
+ truncation=True
69
+ )[0]['summary_text']
 
 
 
70
  except Exception as e:
71
+ logger.error(f"Generation error: {str(e)}")
72
+ return "Error generating content"
73
 
74
+ def read_document(file_path: str) -> str:
75
+ """Lectura optimizada de documentos"""
76
  try:
77
+ if file_path.endswith(".pdf"):
78
+ import fitz
79
+ with fitz.open(file_path) as doc:
80
+ return " ".join(page.get_text() for page in doc)
81
+ elif file_path.endswith(".docx"):
82
+ from docx import Document
83
+ return " ".join(p.text for p in Document(file_path).paragraphs)
84
+ elif file_path.endswith((".xlsx", ".csv")):
85
+ import pandas as pd
86
+ return pd.read_excel(file_path).to_string() if file_path.endswith(".xlsx") else pd.read_csv(file_path).to_string()
87
+ return ""
 
 
 
 
 
 
88
  except Exception as e:
89
+ logger.error(f"Document error: {str(e)}")
90
+ return ""
91
 
92
+ def create_interface():
93
+ """Interfaz optimizada con Gradio"""
94
+ generator = NewsGenerator()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ with gr.Blocks(title="Generador de Noticias Eficiente") as app:
97
+ gr.Markdown("## 📰 Generador de Noticias Optimizado")
 
 
 
 
 
 
98
 
99
  with gr.Row():
100
+ with gr.Column(scale=2):
101
+ inputs = gr.Textbox(label="Entrada Principal", lines=5)
102
+ max_length = gr.Slider(100, 500, value=200, label="Longitud Máxima")
103
+ generate_btn = gr.Button("Generar Noticia", variant="primary")
104
+
105
+ with gr.Column(scale=1):
106
+ doc_upload = gr.File(label="Subir Documento", file_types=[".pdf", ".docx", ".xlsx", ".csv"])
107
+ audio_upload = gr.File(label="Subir Audio", file_types=["audio", "video"])
108
+
109
+ output = gr.Textbox(label="Noticia Generada", lines=10, interactive=False)
110
 
111
+ def process_inputs(
112
+ main_input: str,
113
+ document: Optional[str],
114
+ audio: Optional[str],
115
+ max_len: int
 
 
 
 
116
  ):
117
  try:
118
+ # Procesar documentos y audio
119
+ doc_content = read_document(document) if document else ""
120
+ audio_content = generator.transcribe_audio(audio) if audio else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
+ # Construir prompt
123
+ full_input = "\n".join([
124
+ main_input,
125
+ f"Documento: {doc_content}",
126
+ f"Audio: {audio_content}"
127
+ ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ return generator.generate_news(full_input, max_len)
130
+
131
  except Exception as e:
132
+ logger.error(f"Processing error: {str(e)}")
133
+ return f"Error: {str(e)}"
134
+
135
+ generate_btn.click(
136
+ fn=process_inputs,
137
+ inputs=[inputs, doc_upload, audio_upload, max_length],
138
+ outputs=output
139
+ )
140
+
141
+ return app
 
 
 
 
 
142
 
143
  if __name__ == "__main__":
144
+ app = create_interface()
145
+ app.launch(
146
+ server_name="0.0.0.0",
147
+ server_port=7860,
148
+ share=False,
149
+ show_error=True
150
+ )