CamiloVega committed on
Commit
07cea38
·
verified ·
1 Parent(s): aeb217c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -73
app.py CHANGED
@@ -12,25 +12,25 @@ from typing import Optional, Dict, Any
12
  import fitz # PyMuPDF
13
  import os
14
 
15
- # Logging configuration
16
  logging.basicConfig(
17
  level=logging.INFO,
18
  format='%(asctime)s - %(levelname)s - %(message)s'
19
  )
20
  logger = logging.getLogger(__name__)
21
 
22
- # Get Hugging Face token from environment variable
23
  HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
24
  if not HUGGINGFACE_TOKEN:
25
- logger.warning("HUGGINGFACE_TOKEN not found in environment variables")
26
- raise ValueError("HUGGINGFACE_TOKEN is not configured in environment variables")
27
 
28
- # Hugging Face Authentication
29
  login(token=HUGGINGFACE_TOKEN)
30
 
31
  class NewsGenerator:
32
  def __init__(self):
33
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
34
  self.whisper_model = None
35
  self.llm_model = None
36
  self.tokenizer = None
@@ -38,10 +38,10 @@ class NewsGenerator:
38
  self._load_models()
39
 
40
  def _load_models(self):
41
- """Optimized model loading with 4-bit quantization"""
42
  try:
43
- # Llama-2 7B Chat Model
44
- model_name = "meta-llama/Llama-2-7b-chat-hf"
45
  self.tokenizer = AutoTokenizer.from_pretrained(
46
  model_name,
47
  use_fast=True,
@@ -50,37 +50,43 @@ class NewsGenerator:
50
 
51
  self.llm_model = AutoModelForCausalLM.from_pretrained(
52
  model_name,
53
- device_map="auto",
54
- torch_dtype=torch.float16,
55
- load_in_4bit=True,
56
  low_cpu_mem_usage=True,
57
  token=HUGGINGFACE_TOKEN
58
  )
59
 
60
- # Whisper Configuration
61
  self.whisper_model = whisper.load_model(
62
- "small.en" if self.device == "cpu" else "medium",
63
- device=self.device
64
  )
65
 
66
  except Exception as e:
67
- logger.error(f"Error loading models: {str(e)}")
68
  raise
69
 
70
  def transcribe_audio(self, audio_path: str) -> str:
71
- """Audio transcription with error handling"""
72
  try:
73
  result = self.whisper_model.transcribe(audio_path)
74
  return result.get("text", "")
75
  except Exception as e:
76
- logger.error(f"Transcription error: {str(e)}")
77
  return ""
78
 
79
  def generate_news(self, prompt: str, max_length: int = 512) -> str:
80
- """News generation with Llama-2"""
81
  try:
 
 
 
 
 
 
 
82
  inputs = self.tokenizer(
83
- f"[INST]<<SYS>>You are a professional journalist. Generate a well-structured news article based on the following data:<</SYS>>\n{prompt}[/INST]",
84
  return_tensors="pt"
85
  ).to(self.device)
86
 
@@ -96,11 +102,11 @@ class NewsGenerator:
96
  return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
97
 
98
  except Exception as e:
99
- logger.error(f"Generation error: {str(e)}")
100
- return "Generation error"
101
 
102
  def read_document(file_path: str) -> str:
103
- """Optimized document reading"""
104
  try:
105
  if file_path.endswith(".pdf"):
106
  with fitz.open(file_path) as doc:
@@ -116,99 +122,98 @@ def read_document(file_path: str) -> str:
116
  return pd.read_csv(file_path).to_string()
117
  return ""
118
  except Exception as e:
119
- logger.error(f"Document reading error: {str(e)}")
120
  return ""
121
 
122
  def read_url(url: str) -> str:
123
- """Web content extraction"""
124
  try:
125
  response = requests.get(url, timeout=15)
126
  response.raise_for_status()
127
  return BeautifulSoup(response.content, 'html.parser').get_text(separator=' ', strip=True)
128
  except Exception as e:
129
- logger.error(f"URL reading error: {str(e)}")
130
  return ""
131
 
132
  def process_social_media(url: str) -> Dict[str, Any]:
133
- """Social media content processing"""
134
  try:
135
  text = read_url(url)
136
  return {"text": text, "video": None}
137
  except Exception as e:
138
- logger.error(f"Social media processing error: {str(e)}")
139
  return {"text": "", "video": None}
140
 
141
  def create_interface():
142
- """Gradio user interface"""
143
  generator = NewsGenerator()
144
 
145
- with gr.Blocks(title="AI News Generator", theme=gr.themes.Soft()) as app:
146
  gr.Markdown("""
147
- # 📰 AI News Generator
148
 
149
- Transform raw data into professionally written news articles using advanced AI. This tool combines
150
- multiple sources including text, documents, audio, and web content to generate comprehensive news stories.
151
 
152
- ### Features:
153
- - Multi-source input processing (text, documents, audio, web content)
154
- - Professional journalistic writing styles
155
- - Automatic audio transcription
156
- - Customizable article length and tone
157
 
158
  ---
159
- Developed by Camilo Vega, AI Consultant
160
- [LinkedIn Profile](https://www.linkedin.com/in/camilo-vega-169084b1/)
161
  """)
162
 
163
  with gr.Row():
164
  with gr.Column(scale=3):
165
  main_input = gr.Textbox(
166
- label="Main Topic",
167
- placeholder="Enter the main topic or instructions...",
168
  lines=3
169
  )
170
  additional_data = gr.Textbox(
171
- label="Additional Data",
172
- placeholder="Key facts, names, dates, etc...",
173
  lines=3
174
  )
175
 
176
- with gr.Accordion("Additional Sources", open=False):
177
  doc_upload = gr.File(
178
- label="Upload Document",
179
  file_types=[".pdf", ".docx", ".xlsx", ".csv"]
180
  )
181
  audio_upload = gr.File(
182
- label="Upload Audio/Video",
183
  file_types=["audio", "video"]
184
  )
185
  url_input = gr.Textbox(
186
- label="Reference URL",
187
  placeholder="https://..."
188
  )
189
  social_input = gr.Textbox(
190
- label="Social Media URL",
191
  placeholder="https://..."
192
  )
193
 
194
  length_slider = gr.Slider(
195
  100, 1000, value=400,
196
- label="Article Length (words)"
197
  )
198
  tone_select = gr.Dropdown(
199
- label="Journalistic Tone",
200
- choices=["Formal", "Neutral", "Investigative", "Narrative"],
201
  value="Neutral"
202
  )
203
 
204
  with gr.Column(scale=2):
205
  output_news = gr.Textbox(
206
- label="Generated Article",
207
  lines=18,
208
  interactive=False
209
  )
210
- generate_btn = gr.Button("Generate Article", variant="primary")
211
- status = gr.Textbox(label="Status", interactive=False)
212
 
213
  def process_and_generate(
214
  main_input: str,
@@ -221,37 +226,37 @@ def create_interface():
221
  tone: str
222
  ):
223
  try:
224
- # Process additional sources
225
  doc_content = read_document(document) if document else ""
226
  audio_content = generator.transcribe_audio(audio) if audio else ""
227
  url_content = read_url(url) if url else ""
228
  social_content = process_social_media(social_url) if social_url else {"text": ""}
229
 
230
- # Build structured prompt
231
  prompt = f"""
232
- ## Instructions:
233
- - Main Topic: {main_input}
234
- - Provided Data: {additional_data}
235
- - Required Tone: {tone}
236
 
237
- ## Sources:
238
- - Document: {doc_content[:1000]}...
239
- - Audio: {audio_content[:500]}...
240
- - URL: {url_content[:1000]}...
241
- - Social Media: {social_content['text'][:500]}...
242
 
243
- ## Requirements:
244
- - Professional structure (headline, lead, body)
245
- - Include the 5W's
246
- - Relevant quotes if applicable
247
- - Length: {length} words
248
  """
249
 
250
- return generator.generate_news(prompt, length), "✅ Generation successful"
251
 
252
  except Exception as e:
253
  logger.error(str(e))
254
- return f"Error: {str(e)}", "❌ Generation error"
255
 
256
  generate_btn.click(
257
  fn=process_and_generate,
 
12
  import fitz # PyMuPDF
13
  import os
14
 
15
+ # Configuración de logging
16
  logging.basicConfig(
17
  level=logging.INFO,
18
  format='%(asctime)s - %(levelname)s - %(message)s'
19
  )
20
  logger = logging.getLogger(__name__)
21
 
22
+ # Obtener token de Hugging Face
23
  HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
24
  if not HUGGINGFACE_TOKEN:
25
+ logger.warning("HUGGINGFACE_TOKEN no encontrado en variables de entorno")
26
+ raise ValueError("Configura HUGGINGFACE_TOKEN en las variables de entorno")
27
 
28
+ # Autenticación en Hugging Face
29
  login(token=HUGGINGFACE_TOKEN)
30
 
31
  class NewsGenerator:
32
  def __init__(self):
33
+ self.device = "cpu" # Forzar uso de CPU
34
  self.whisper_model = None
35
  self.llm_model = None
36
  self.tokenizer = None
 
38
  self._load_models()
39
 
40
  def _load_models(self):
41
+ """Carga optimizada de modelos para CPU"""
42
  try:
43
+ # Modelo DeepSeek ligero
44
+ model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
45
  self.tokenizer = AutoTokenizer.from_pretrained(
46
  model_name,
47
  use_fast=True,
 
50
 
51
  self.llm_model = AutoModelForCausalLM.from_pretrained(
52
  model_name,
53
+ device_map="cpu",
54
+ torch_dtype=torch.float32, # Usar float32 para CPU
 
55
  low_cpu_mem_usage=True,
56
  token=HUGGINGFACE_TOKEN
57
  )
58
 
59
+ # Configuración de Whisper (versión reducida)
60
  self.whisper_model = whisper.load_model(
61
+ "tiny.en",
62
+ device="cpu"
63
  )
64
 
65
  except Exception as e:
66
+ logger.error(f"Error cargando modelos: {str(e)}")
67
  raise
68
 
69
  def transcribe_audio(self, audio_path: str) -> str:
70
+ """Transcripción de audio con manejo de errores"""
71
  try:
72
  result = self.whisper_model.transcribe(audio_path)
73
  return result.get("text", "")
74
  except Exception as e:
75
+ logger.error(f"Error en transcripción: {str(e)}")
76
  return ""
77
 
78
  def generate_news(self, prompt: str, max_length: int = 512) -> str:
79
+ """Generación de noticias con DeepSeek"""
80
  try:
81
+ # Formato de prompt específico para DeepSeek
82
+ formatted_prompt = (
83
+ f"<|System|>\nEres un periodista profesional. Genera un artículo noticioso "
84
+ f"basado en estos datos:\n{prompt}\n<|End|>\n"
85
+ f"<|User|>\nRedacta el artículo:<|End|>\n<|Assistant|>"
86
+ )
87
+
88
  inputs = self.tokenizer(
89
+ formatted_prompt,
90
  return_tensors="pt"
91
  ).to(self.device)
92
 
 
102
  return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
103
 
104
  except Exception as e:
105
+ logger.error(f"Error en generación: {str(e)}")
106
+ return "Error generando el artículo"
107
 
108
  def read_document(file_path: str) -> str:
109
+ """Lectura optimizada de documentos"""
110
  try:
111
  if file_path.endswith(".pdf"):
112
  with fitz.open(file_path) as doc:
 
122
  return pd.read_csv(file_path).to_string()
123
  return ""
124
  except Exception as e:
125
+ logger.error(f"Error leyendo documento: {str(e)}")
126
  return ""
127
 
128
  def read_url(url: str) -> str:
129
+ """Extracción de contenido web"""
130
  try:
131
  response = requests.get(url, timeout=15)
132
  response.raise_for_status()
133
  return BeautifulSoup(response.content, 'html.parser').get_text(separator=' ', strip=True)
134
  except Exception as e:
135
+ logger.error(f"Error leyendo URL: {str(e)}")
136
  return ""
137
 
138
  def process_social_media(url: str) -> Dict[str, Any]:
139
+ """Procesamiento de redes sociales"""
140
  try:
141
  text = read_url(url)
142
  return {"text": text, "video": None}
143
  except Exception as e:
144
+ logger.error(f"Error procesando red social: {str(e)}")
145
  return {"text": "", "video": None}
146
 
147
  def create_interface():
148
+ """Interfaz de usuario de Gradio"""
149
  generator = NewsGenerator()
150
 
151
+ with gr.Blocks(title="Generador de Noticias AI", theme=gr.themes.Soft()) as app:
152
  gr.Markdown("""
153
+ # 📰 Generador de Noticias AI
154
 
155
+ Transforma datos en bruto en artículos periodísticos profesionales usando IA avanzada.
 
156
 
157
+ ### Características:
158
+ - Procesamiento multi-fuente (texto, documentos, audio, web)
159
+ - Estilos periodísticos profesionales
160
+ - Transcripción automática de audio
161
+ - Longitud y tono personalizables
162
 
163
  ---
164
+ Desarrollado por Camilo Vega, Consultor en IA
165
+ [Perfil de LinkedIn](https://www.linkedin.com/in/camilo-vega-169084b1/)
166
  """)
167
 
168
  with gr.Row():
169
  with gr.Column(scale=3):
170
  main_input = gr.Textbox(
171
+ label="Tema Principal",
172
+ placeholder="Ingrese el tema principal o instrucciones...",
173
  lines=3
174
  )
175
  additional_data = gr.Textbox(
176
+ label="Datos Adicionales",
177
+ placeholder="Hechos clave, nombres, fechas...",
178
  lines=3
179
  )
180
 
181
+ with gr.Accordion("Fuentes Adicionales", open=False):
182
  doc_upload = gr.File(
183
+ label="Subir Documento",
184
  file_types=[".pdf", ".docx", ".xlsx", ".csv"]
185
  )
186
  audio_upload = gr.File(
187
+ label="Subir Audio/Video",
188
  file_types=["audio", "video"]
189
  )
190
  url_input = gr.Textbox(
191
+ label="URL de Referencia",
192
  placeholder="https://..."
193
  )
194
  social_input = gr.Textbox(
195
+ label="URL de Red Social",
196
  placeholder="https://..."
197
  )
198
 
199
  length_slider = gr.Slider(
200
  100, 1000, value=400,
201
+ label="Longitud del Artículo (palabras)"
202
  )
203
  tone_select = gr.Dropdown(
204
+ label="Tono Periodístico",
205
+ choices=["Formal", "Neutral", "Investigativo", "Narrativo"],
206
  value="Neutral"
207
  )
208
 
209
  with gr.Column(scale=2):
210
  output_news = gr.Textbox(
211
+ label="Artículo Generado",
212
  lines=18,
213
  interactive=False
214
  )
215
+ generate_btn = gr.Button("Generar Artículo", variant="primary")
216
+ status = gr.Textbox(label="Estado", interactive=False)
217
 
218
  def process_and_generate(
219
  main_input: str,
 
226
  tone: str
227
  ):
228
  try:
229
+ # Procesar fuentes adicionales
230
  doc_content = read_document(document) if document else ""
231
  audio_content = generator.transcribe_audio(audio) if audio else ""
232
  url_content = read_url(url) if url else ""
233
  social_content = process_social_media(social_url) if social_url else {"text": ""}
234
 
235
+ # Construir prompt estructurado
236
  prompt = f"""
237
+ ## Instrucciones:
238
+ - Tema Principal: {main_input}
239
+ - Datos Proporcionados: {additional_data}
240
+ - Tono Requerido: {tone}
241
 
242
+ ## Fuentes:
243
+ - Documento: {doc_content[:500]}...
244
+ - Audio: {audio_content[:300]}...
245
+ - URL: {url_content[:500]}...
246
+ - Red Social: {social_content['text'][:300]}...
247
 
248
+ ## Requisitos:
249
+ - Estructura profesional (titular, lead, cuerpo)
250
+ - Incluir las 5W
251
+ - Citas relevantes si aplica
252
+ - Longitud: {length} palabras
253
  """
254
 
255
+ return generator.generate_news(prompt, length), "✅ Generación exitosa"
256
 
257
  except Exception as e:
258
  logger.error(str(e))
259
+ return f"Error: {str(e)}", "❌ Error en generación"
260
 
261
  generate_btn.click(
262
  fn=process_and_generate,