Bils committed on
Commit
216e869
·
verified ·
1 Parent(s): 1653c85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -30
app.py CHANGED
@@ -8,6 +8,7 @@ from pydub import AudioSegment
8
  from dotenv import load_dotenv
9
  import spaces
10
  import gradio as gr
 
11
 
12
  # Transformers & Models
13
  from transformers import (
@@ -17,9 +18,13 @@ from transformers import (
17
  AutoProcessor,
18
  MusicgenForConditionalGeneration,
19
  )
 
20
  # Coqui TTS
21
  from TTS.api import TTS
22
 
 
 
 
23
  # ---------------------------------------------------------------------
24
  # Setup Logging and Environment Variables
25
  # ---------------------------------------------------------------------
@@ -33,6 +38,7 @@ HF_TOKEN = os.getenv("HF_TOKEN")
33
  LLAMA_PIPELINES = {}
34
  MUSICGEN_MODELS = {}
35
  TTS_MODELS = {}
 
36
 
37
  # ---------------------------------------------------------------------
38
  # Utility Function
@@ -65,7 +71,6 @@ def get_llama_pipeline(model_id: str, token: str):
65
  LLAMA_PIPELINES[model_id] = text_pipeline
66
  return text_pipeline
67
 
68
-
69
  def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
70
  """
71
  Returns a cached MusicGen model and processor if available; otherwise, loads and caches them.
@@ -81,7 +86,6 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
81
  MUSICGEN_MODELS[model_key] = (model, processor)
82
  return model, processor
83
 
84
-
85
  def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
86
  """
87
  Returns a cached TTS model if available; otherwise, loads and caches it.
@@ -93,6 +97,16 @@ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
93
  TTS_MODELS[model_name] = tts_model
94
  return tts_model
95
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # ---------------------------------------------------------------------
98
  # Script Generation Function
@@ -127,7 +141,6 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
127
  if "Output:" in generated_text:
128
  generated_text = generated_text.split("Output:")[-1].strip()
129
 
130
-
131
  pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
132
  match = re.search(pattern, generated_text, re.DOTALL)
133
  if match:
@@ -143,7 +156,6 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
143
  logging.exception("Error generating script")
144
  return f"Error generating script: {e}", "", ""
145
 
146
-
147
  # ---------------------------------------------------------------------
148
  # Voice-Over Generation Function
149
  # ---------------------------------------------------------------------
@@ -168,7 +180,6 @@ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/ta
168
  logging.exception("Error generating voice")
169
  return f"Error generating voice: {e}"
170
 
171
-
172
  # ---------------------------------------------------------------------
173
  # Music Generation Function
174
  # ---------------------------------------------------------------------
@@ -202,43 +213,85 @@ def generate_music(prompt: str, audio_length: int):
202
  logging.exception("Error generating music")
203
  return f"Error generating music: {e}"
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  # ---------------------------------------------------------------------
207
- # Audio Blending with Duration Sync & Ducking
208
  # ---------------------------------------------------------------------
209
  @spaces.GPU(duration=100)
210
- def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
211
  """
212
- Blends two audio files (voice and music).
213
- - Loops music if shorter than voice.
214
- - Trims music if longer than voice.
215
- - Applies ducking to lower music volume during voice segments if enabled.
216
  Returns the file path to the blended .wav file.
217
  """
218
  try:
219
- if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
220
- return "Error: Missing audio files for blending."
 
 
221
 
 
222
  voice = AudioSegment.from_wav(voice_path)
223
  music = AudioSegment.from_wav(music_path)
 
224
 
225
- voice_len = len(voice)
226
 
227
-
228
  if len(music) < voice_len:
229
  looped_music = AudioSegment.empty()
230
  while len(looped_music) < voice_len:
231
  looped_music += music
232
  music = looped_music
233
-
234
-
235
  music = music[:voice_len]
236
 
 
 
 
 
 
 
 
 
 
237
  if ducking:
238
- ducked_music = music - duck_level
239
- final_audio = ducked_music.overlay(voice)
240
- else:
241
- final_audio = music.overlay(voice)
 
 
 
 
242
 
243
  output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
244
  final_audio.export(output_path, format="wav")
@@ -248,7 +301,6 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
248
  logging.exception("Error blending audio")
249
  return f"Error blending audio: {e}"
250
 
251
-
252
  # ---------------------------------------------------------------------
253
  # Gradio Interface
254
  # ---------------------------------------------------------------------
@@ -298,20 +350,21 @@ with gr.Blocks(css="""
298
  <p>Your all-in-one AI solution for creating professional audio ads.</p>
299
  """)
300
 
301
-
302
  gr.Markdown("""
303
  **Welcome to Ai Ads Promo!**
304
 
305
  This simple, easy-to-use app helps you create amazing audio ads in just a few steps. Here’s how it works:
306
 
307
  1. **Script Generation:**
308
- - Share your idea and let our AI craft a clear and engaging voice-over script.
309
  2. **Voice Synthesis:**
310
  - Convert your script into a natural-sounding voice-over using advanced text-to-speech technology.
311
  3. **Music Production:**
312
  - Generate a custom music track that perfectly fits your ad.
313
- 4. **Audio Blending:**
314
- - Combine your voice-over and music seamlessly. You can even adjust the music volume (ducking) when the voice plays.
 
 
315
 
316
  **Benefits:**
317
  - **Easy to Use:** Designed for everyone – no technical skills required.
@@ -396,9 +449,21 @@ with gr.Blocks(css="""
396
  outputs=[music_output],
397
  )
398
 
399
- # Step 4: Audio Blending
 
 
 
 
 
 
 
 
 
 
 
 
400
  with gr.Tab("🎚️ Audio Blending"):
401
- gr.Markdown("Blend your voice-over and music track. Music will be looped or trimmed to match your voice duration. Enable ducking to lower the music while the voice plays.")
402
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
403
  duck_level_slider = gr.Slider(
404
  label="Ducking Level (dB attenuation)",
@@ -407,12 +472,12 @@ with gr.Blocks(css="""
407
  step=1,
408
  value=10
409
  )
410
- blend_button = gr.Button("Blend Voice + Music", variant="primary")
411
  blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
412
 
413
  blend_button.click(
414
  fn=blend_audio,
415
- inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
416
  outputs=blended_output
417
  )
418
 
 
8
  from dotenv import load_dotenv
9
  import spaces
10
  import gradio as gr
11
+ import numpy as np
12
 
13
  # Transformers & Models
14
  from transformers import (
 
18
  AutoProcessor,
19
  MusicgenForConditionalGeneration,
20
  )
21
+
22
  # Coqui TTS
23
  from TTS.api import TTS
24
 
25
+ # Diffusers for sound design generation
26
+ from diffusers import DiffusionPipeline
27
+
28
  # ---------------------------------------------------------------------
29
  # Setup Logging and Environment Variables
30
  # ---------------------------------------------------------------------
 
38
  LLAMA_PIPELINES = {}
39
  MUSICGEN_MODELS = {}
40
  TTS_MODELS = {}
41
+ SOUND_DESIGN_PIPELINES = {}
42
 
43
  # ---------------------------------------------------------------------
44
  # Utility Function
 
71
  LLAMA_PIPELINES[model_id] = text_pipeline
72
  return text_pipeline
73
 
 
74
  def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
75
  """
76
  Returns a cached MusicGen model and processor if available; otherwise, loads and caches them.
 
86
  MUSICGEN_MODELS[model_key] = (model, processor)
87
  return model, processor
88
 
 
89
  def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
90
  """
91
  Returns a cached TTS model if available; otherwise, loads and caches it.
 
97
  TTS_MODELS[model_name] = tts_model
98
  return tts_model
99
 
100
def get_sound_design_pipeline(model_name: str, token: str):
    """
    Fetch the DiffusionPipeline used for sound design, loading it on first
    use and serving it from the module-level cache on every call after that.

    Args:
        model_name: Hugging Face model identifier for the pipeline.
        token: Hugging Face auth token forwarded to ``from_pretrained``.

    Returns:
        The cached or freshly loaded DiffusionPipeline instance.
    """
    cached = SOUND_DESIGN_PIPELINES.get(model_name)
    if cached is not None:
        return cached
    pipeline = DiffusionPipeline.from_pretrained(model_name, use_auth_token=token)
    SOUND_DESIGN_PIPELINES[model_name] = pipeline
    return pipeline
110
 
111
  # ---------------------------------------------------------------------
112
  # Script Generation Function
 
141
  if "Output:" in generated_text:
142
  generated_text = generated_text.split("Output:")[-1].strip()
143
 
 
144
  pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
145
  match = re.search(pattern, generated_text, re.DOTALL)
146
  if match:
 
156
  logging.exception("Error generating script")
157
  return f"Error generating script: {e}", "", ""
158
 
 
159
  # ---------------------------------------------------------------------
160
  # Voice-Over Generation Function
161
  # ---------------------------------------------------------------------
 
180
  logging.exception("Error generating voice")
181
  return f"Error generating voice: {e}"
182
 
 
183
  # ---------------------------------------------------------------------
184
  # Music Generation Function
185
  # ---------------------------------------------------------------------
 
213
  logging.exception("Error generating music")
214
  return f"Error generating music: {e}"
215
 
216
+ # ---------------------------------------------------------------------
217
+ # Sound Design Generation Function
218
+ # ---------------------------------------------------------------------
219
@spaces.GPU(duration=200)
def generate_sound_design(prompt: str):
    """
    Generate a sound-design audio clip from a text prompt using AudioLDM2.

    Args:
        prompt: Text describing the desired sound effect. Must be non-empty.

    Returns:
        Path to the generated 16-bit mono .wav file on success, or an error
        message string on failure (callers detect the "Error" prefix).
    """
    try:
        if not prompt.strip():
            return "Error: No sound design suggestion provided."

        pipe = get_sound_design_pipeline("cvssp/audioldm2", HF_TOKEN)

        # Generate audio from the prompt; assumes the pipeline result is a
        # mapping with key 'audios' holding a batch of float waveforms —
        # TODO confirm against the installed diffusers version.
        result = pipe(prompt)
        audio_samples = result["audios"][0]

        # Normalize to full-scale int16. Guard against an all-silent clip:
        # dividing by a zero peak would yield NaNs and garbage samples.
        peak = np.max(np.abs(audio_samples))
        if peak > 0:
            audio_samples = audio_samples / peak
        normalized_audio = (audio_samples * 32767).astype("int16")

        output_path = os.path.join(tempfile.gettempdir(), "sound_design_generated.wav")
        # NOTE(review): `write` is presumably scipy.io.wavfile.write —
        # verify it is imported at the top of the file.
        write(output_path, 44100, normalized_audio)
        return output_path

    except Exception as e:
        logging.exception("Error generating sound design")
        return f"Error generating sound design: {e}"
243
 
244
  # ---------------------------------------------------------------------
245
+ # Audio Blending with Duration Sync & Ducking (Voice + Sound Design + Music)
246
  # ---------------------------------------------------------------------
247
@spaces.GPU(duration=100)
def blend_audio(voice_path: str, sound_effect_path: str, music_path: str, ducking: bool, duck_level: int = 10):
    """
    Blend voice, sound-design, and music tracks into one .wav file.

    The music and sound-effect tracks are looped (if shorter) or trimmed
    (if longer) to match the voice duration. When ducking is enabled, both
    background tracks are attenuated by ``duck_level`` dB before mixing.

    Args:
        voice_path: Path to the voice-over .wav file.
        sound_effect_path: Path to the sound-design .wav file.
        music_path: Path to the music .wav file.
        ducking: Whether to attenuate background tracks under the voice.
        duck_level: Attenuation in dB applied when ducking is enabled.

    Returns:
        Path to the blended .wav file, or an error message string
        (callers detect the "Error" prefix).
    """
    def _fit_to_length(segment, target_ms):
        # Loop the segment until it covers target_ms, then trim exactly.
        # An empty segment can never be looped up to length — return it
        # as-is instead of spinning forever in the loop below.
        if len(segment) == 0:
            return segment
        if len(segment) < target_ms:
            looped = AudioSegment.empty()
            while len(looped) < target_ms:
                looped += segment
            segment = looped
        return segment[:target_ms]

    try:
        # Verify input files exist before attempting to decode them.
        for path in [voice_path, sound_effect_path, music_path]:
            if not os.path.isfile(path):
                return f"Error: Missing audio file for {path}"

        voice = AudioSegment.from_wav(voice_path)
        music = AudioSegment.from_wav(music_path)
        sound_effect = AudioSegment.from_wav(sound_effect_path)

        voice_len = len(voice)  # duration in milliseconds

        # Match both background tracks to the voice duration.
        music = _fit_to_length(music, voice_len)
        sound_effect = _fit_to_length(sound_effect, voice_len)

        # Lower background volume so the voice stays intelligible.
        if ducking:
            music = music - duck_level
            sound_effect = sound_effect - duck_level

        # Mix the two background tracks first, then overlay the voice.
        background = music.overlay(sound_effect)
        final_audio = background.overlay(voice)

        output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
        final_audio.export(output_path, format="wav")
        return output_path

    except Exception as e:
        logging.exception("Error blending audio")
        return f"Error blending audio: {e}"
303
 
 
304
  # ---------------------------------------------------------------------
305
  # Gradio Interface
306
  # ---------------------------------------------------------------------
 
350
  <p>Your all-in-one AI solution for creating professional audio ads.</p>
351
  """)
352
 
 
353
  gr.Markdown("""
354
  **Welcome to Ai Ads Promo!**
355
 
356
  This simple, easy-to-use app helps you create amazing audio ads in just a few steps. Here’s how it works:
357
 
358
  1. **Script Generation:**
359
+ - Share your idea and let our AI craft a clear and engaging voice-over script, along with sound design and music suggestions.
360
  2. **Voice Synthesis:**
361
  - Convert your script into a natural-sounding voice-over using advanced text-to-speech technology.
362
  3. **Music Production:**
363
  - Generate a custom music track that perfectly fits your ad.
364
+ 4. **Sound Design:**
365
+ - Generate creative sound effects based on our sound design suggestions.
366
+ 5. **Audio Blending:**
367
+ - Combine your voice-over, sound effects, and music seamlessly. Enable ducking to lower background audio during voice segments.
368
 
369
  **Benefits:**
370
  - **Easy to Use:** Designed for everyone – no technical skills required.
 
449
  outputs=[music_output],
450
  )
451
 
452
+ # Step 4: Sound Design Generation
453
+ with gr.Tab("🎧 Sound Design Generation"):
454
+ gr.Markdown("Generate a creative sound design track based on the sound design suggestions from the script.")
455
+ generate_sound_design_button = gr.Button("Generate Sound Design", variant="primary")
456
+ sound_design_audio_output = gr.Audio(label="Generated Sound Design (WAV)", type="filepath")
457
+
458
+ generate_sound_design_button.click(
459
+ fn=generate_sound_design,
460
+ inputs=[sound_design_output],
461
+ outputs=[sound_design_audio_output],
462
+ )
463
+
464
+ # Step 5: Audio Blending (Voice + Sound Design + Music)
465
  with gr.Tab("🎚️ Audio Blending"):
466
+ gr.Markdown("Blend your voice-over, sound design, and music track. The background audio (music and sound design) can be ducked during voice segments.")
467
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
468
  duck_level_slider = gr.Slider(
469
  label="Ducking Level (dB attenuation)",
 
472
  step=1,
473
  value=10
474
  )
475
+ blend_button = gr.Button("Blend Audio", variant="primary")
476
  blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
477
 
478
  blend_button.click(
479
  fn=blend_audio,
480
+ inputs=[voice_audio_output, sound_design_audio_output, music_output, ducking_checkbox, duck_level_slider],
481
  outputs=blended_output
482
  )
483