Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,13 @@
|
|
1 |
-
import gradio as gr
|
2 |
import os
|
3 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from transformers import (
|
5 |
AutoTokenizer,
|
6 |
AutoModelForCausalLM,
|
@@ -8,12 +15,6 @@ from transformers import (
|
|
8 |
AutoProcessor,
|
9 |
MusicgenForConditionalGeneration,
|
10 |
)
|
11 |
-
from scipy.io.wavfile import write
|
12 |
-
from pydub import AudioSegment
|
13 |
-
from dotenv import load_dotenv
|
14 |
-
import tempfile
|
15 |
-
import spaces
|
16 |
-
|
17 |
# Coqui TTS
|
18 |
from TTS.api import TTS
|
19 |
|
@@ -99,7 +100,7 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
|
|
99 |
f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
|
100 |
"1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
|
101 |
"2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
|
102 |
-
"3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
|
103 |
)
|
104 |
combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
|
105 |
|
@@ -198,7 +199,7 @@ def generate_music(prompt: str, audio_length: int):
|
|
198 |
audio_data = outputs[0, 0].cpu().numpy()
|
199 |
normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
|
200 |
|
201 |
-
output_path =
|
202 |
write(output_path, 44100, normalized_audio)
|
203 |
|
204 |
return output_path
|
@@ -229,26 +230,21 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
|
|
229 |
voice_len = len(voice) # in milliseconds
|
230 |
music_len = len(music) # in milliseconds
|
231 |
|
232 |
-
#
|
233 |
if music_len < voice_len:
|
234 |
looped_music = AudioSegment.empty()
|
235 |
-
# Keep appending until we exceed voice length
|
236 |
while len(looped_music) < voice_len:
|
237 |
looped_music += music
|
238 |
music = looped_music
|
239 |
|
240 |
-
#
|
241 |
if len(music) > voice_len:
|
242 |
music = music[:voice_len]
|
243 |
|
244 |
-
# Now music and voice are the same length
|
245 |
if ducking:
|
246 |
-
# Step 1: Reduce music dB while voice is playing
|
247 |
ducked_music = music - duck_level
|
248 |
-
# Step 2: Overlay voice on top of ducked music
|
249 |
final_audio = ducked_music.overlay(voice)
|
250 |
else:
|
251 |
-
# No ducking, just overlay
|
252 |
final_audio = music.overlay(voice)
|
253 |
|
254 |
output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
|
@@ -260,32 +256,73 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
|
|
260 |
|
261 |
|
262 |
# ---------------------------------------------------------------------
|
263 |
-
# Gradio Interface
|
264 |
# ---------------------------------------------------------------------
|
265 |
-
with gr.Blocks(
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
|
280 |
with gr.Tabs():
|
281 |
# Step 1: Generate Script
|
282 |
-
with gr.Tab("
|
283 |
with gr.Row():
|
284 |
user_prompt = gr.Textbox(
|
285 |
label="Promo Idea",
|
286 |
placeholder="E.g., A 30-second promo for a morning show...",
|
287 |
lines=2
|
288 |
)
|
|
|
289 |
llama_model_id = gr.Textbox(
|
290 |
label="LLaMA Model ID",
|
291 |
value="meta-llama/Meta-Llama-3-8B-Instruct",
|
@@ -298,8 +335,7 @@ Whether you’re a radio producer, podcaster, or content creator, **AI Promo Stu
|
|
298 |
step=15,
|
299 |
value=30
|
300 |
)
|
301 |
-
|
302 |
-
generate_script_button = gr.Button("Generate Script")
|
303 |
script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False)
|
304 |
sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
|
305 |
music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
|
@@ -311,8 +347,8 @@ Whether you’re a radio producer, podcaster, or content creator, **AI Promo Stu
|
|
311 |
)
|
312 |
|
313 |
# Step 2: Generate Voice
|
314 |
-
with gr.Tab("
|
315 |
-
gr.Markdown("Generate
|
316 |
selected_tts_model = gr.Dropdown(
|
317 |
label="TTS Model",
|
318 |
choices=[
|
@@ -323,7 +359,7 @@ Whether you’re a radio producer, podcaster, or content creator, **AI Promo Stu
|
|
323 |
value="tts_models/en/ljspeech/tacotron2-DDC",
|
324 |
multiselect=False
|
325 |
)
|
326 |
-
generate_voice_button = gr.Button("Generate Voice-Over")
|
327 |
voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")
|
328 |
|
329 |
generate_voice_button.click(
|
@@ -332,18 +368,18 @@ Whether you’re a radio producer, podcaster, or content creator, **AI Promo Stu
|
|
332 |
outputs=voice_audio_output,
|
333 |
)
|
334 |
|
335 |
-
# Step 3: Generate Music
|
336 |
-
with gr.Tab("
|
337 |
-
gr.Markdown("Generate a music track
|
338 |
audio_length = gr.Slider(
|
339 |
label="Music Length (tokens)",
|
340 |
minimum=128,
|
341 |
maximum=1024,
|
342 |
step=64,
|
343 |
value=512,
|
344 |
-
info="Increase tokens for longer audio
|
345 |
)
|
346 |
-
generate_music_button = gr.Button("Generate Music")
|
347 |
music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
|
348 |
|
349 |
generate_music_button.click(
|
@@ -352,9 +388,9 @@ Whether you’re a radio producer, podcaster, or content creator, **AI Promo Stu
|
|
352 |
outputs=[music_output],
|
353 |
)
|
354 |
|
355 |
-
# Step 4: Blend Audio
|
356 |
-
with gr.Tab("
|
357 |
-
gr.Markdown("
|
358 |
ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
|
359 |
duck_level_slider = gr.Slider(
|
360 |
label="Ducking Level (dB attenuation)",
|
@@ -363,7 +399,7 @@ Whether you’re a radio producer, podcaster, or content creator, **AI Promo Stu
|
|
363 |
step=1,
|
364 |
value=10
|
365 |
)
|
366 |
-
blend_button = gr.Button("Blend Voice + Music")
|
367 |
blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
|
368 |
|
369 |
blend_button.click(
|
@@ -374,17 +410,21 @@ Whether you’re a radio producer, podcaster, or content creator, **AI Promo Stu
|
|
374 |
|
375 |
# Footer
|
376 |
gr.Markdown("""
|
377 |
-
<
|
378 |
-
|
379 |
-
Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
|
380 |
-
|
|
|
|
|
381 |
""")
|
382 |
|
383 |
# Visitor Badge
|
384 |
gr.HTML("""
|
385 |
-
<
|
386 |
-
<
|
387 |
-
|
|
|
|
|
388 |
""")
|
389 |
|
390 |
demo.launch(debug=True)
|
|
|
|
|
1 |
import os
|
2 |
import torch
|
3 |
+
import tempfile
|
4 |
+
from scipy.io.wavfile import write
|
5 |
+
from pydub import AudioSegment
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
import spaces
|
8 |
+
import gradio as gr
|
9 |
+
|
10 |
+
# Transformers & Models
|
11 |
from transformers import (
|
12 |
AutoTokenizer,
|
13 |
AutoModelForCausalLM,
|
|
|
15 |
AutoProcessor,
|
16 |
MusicgenForConditionalGeneration,
|
17 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
# Coqui TTS
|
19 |
from TTS.api import TTS
|
20 |
|
|
|
100 |
f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
|
101 |
"1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
|
102 |
"2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
|
103 |
+
"3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
|
104 |
)
|
105 |
combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
|
106 |
|
|
|
199 |
audio_data = outputs[0, 0].cpu().numpy()
|
200 |
normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
|
201 |
|
202 |
+
output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
|
203 |
write(output_path, 44100, normalized_audio)
|
204 |
|
205 |
return output_path
|
|
|
230 |
voice_len = len(voice) # in milliseconds
|
231 |
music_len = len(music) # in milliseconds
|
232 |
|
233 |
+
# Loop music if it's shorter than voice
|
234 |
if music_len < voice_len:
|
235 |
looped_music = AudioSegment.empty()
|
|
|
236 |
while len(looped_music) < voice_len:
|
237 |
looped_music += music
|
238 |
music = looped_music
|
239 |
|
240 |
+
# Trim music if it's longer than voice
|
241 |
if len(music) > voice_len:
|
242 |
music = music[:voice_len]
|
243 |
|
|
|
244 |
if ducking:
|
|
|
245 |
ducked_music = music - duck_level
|
|
|
246 |
final_audio = ducked_music.overlay(voice)
|
247 |
else:
|
|
|
248 |
final_audio = music.overlay(voice)
|
249 |
|
250 |
output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
|
|
|
256 |
|
257 |
|
258 |
# ---------------------------------------------------------------------
|
259 |
+
# Gradio Interface with Enhanced UI
|
260 |
# ---------------------------------------------------------------------
|
261 |
+
with gr.Blocks(css="""
|
262 |
+
/* Global Styles */
|
263 |
+
body {
|
264 |
+
background: linear-gradient(135deg, #1d1f21, #3a3d41);
|
265 |
+
color: #f0f0f0;
|
266 |
+
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
267 |
+
}
|
268 |
+
.header {
|
269 |
+
text-align: center;
|
270 |
+
padding: 2rem 1rem;
|
271 |
+
background: linear-gradient(90deg, #6a11cb, #2575fc);
|
272 |
+
border-radius: 0 0 20px 20px;
|
273 |
+
margin-bottom: 2rem;
|
274 |
+
}
|
275 |
+
.header h1 {
|
276 |
+
margin: 0;
|
277 |
+
font-size: 2.5rem;
|
278 |
+
}
|
279 |
+
.header p {
|
280 |
+
font-size: 1.2rem;
|
281 |
+
}
|
282 |
+
.gradio-container {
|
283 |
+
background: #2e2e2e;
|
284 |
+
border-radius: 10px;
|
285 |
+
padding: 1rem;
|
286 |
+
}
|
287 |
+
.tab-title {
|
288 |
+
font-size: 1.1rem;
|
289 |
+
font-weight: bold;
|
290 |
+
}
|
291 |
+
.footer {
|
292 |
+
text-align: center;
|
293 |
+
font-size: 0.9em;
|
294 |
+
margin-top: 2rem;
|
295 |
+
padding: 1rem;
|
296 |
+
color: #cccccc;
|
297 |
+
}
|
298 |
+
""") as demo:
|
299 |
+
|
300 |
+
# Custom Header
|
301 |
+
with gr.Row(elem_classes="header"):
|
302 |
+
gr.Markdown("""
|
303 |
+
<h1>🎧 AI Promo Studio</h1>
|
304 |
+
<p>Your all-in-one AI solution for crafting engaging audio promos.</p>
|
305 |
+
""")
|
306 |
|
307 |
+
gr.Markdown("""
|
308 |
+
Welcome to **AI Promo Studio**! This platform leverages state-of-the-art AI models to help you generate:
|
309 |
+
|
310 |
+
- **Script**: Generate a compelling voice-over script with LLaMA.
|
311 |
+
- **Voice Synthesis**: Create natural-sounding voice-overs using Coqui TTS.
|
312 |
+
- **Music Production**: Produce custom music tracks with MusicGen.
|
313 |
+
- **Audio Blending**: Seamlessly blend voice and music with options for ducking.
|
314 |
+
""")
|
315 |
|
316 |
with gr.Tabs():
|
317 |
# Step 1: Generate Script
|
318 |
+
with gr.Tab("📝 Script Generation"):
|
319 |
with gr.Row():
|
320 |
user_prompt = gr.Textbox(
|
321 |
label="Promo Idea",
|
322 |
placeholder="E.g., A 30-second promo for a morning show...",
|
323 |
lines=2
|
324 |
)
|
325 |
+
with gr.Row():
|
326 |
llama_model_id = gr.Textbox(
|
327 |
label="LLaMA Model ID",
|
328 |
value="meta-llama/Meta-Llama-3-8B-Instruct",
|
|
|
335 |
step=15,
|
336 |
value=30
|
337 |
)
|
338 |
+
generate_script_button = gr.Button("Generate Script", variant="primary")
|
|
|
339 |
script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False)
|
340 |
sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
|
341 |
music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
|
|
|
347 |
)
|
348 |
|
349 |
# Step 2: Generate Voice
|
350 |
+
with gr.Tab("🎤 Voice Synthesis"):
|
351 |
+
gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
|
352 |
selected_tts_model = gr.Dropdown(
|
353 |
label="TTS Model",
|
354 |
choices=[
|
|
|
359 |
value="tts_models/en/ljspeech/tacotron2-DDC",
|
360 |
multiselect=False
|
361 |
)
|
362 |
+
generate_voice_button = gr.Button("Generate Voice-Over", variant="primary")
|
363 |
voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")
|
364 |
|
365 |
generate_voice_button.click(
|
|
|
368 |
outputs=voice_audio_output,
|
369 |
)
|
370 |
|
371 |
+
# Step 3: Generate Music
|
372 |
+
with gr.Tab("🎶 Music Production"):
|
373 |
+
gr.Markdown("Generate a custom music track using the **MusicGen Large** model.")
|
374 |
audio_length = gr.Slider(
|
375 |
label="Music Length (tokens)",
|
376 |
minimum=128,
|
377 |
maximum=1024,
|
378 |
step=64,
|
379 |
value=512,
|
380 |
+
info="Increase tokens for longer audio (inference time may vary)."
|
381 |
)
|
382 |
+
generate_music_button = gr.Button("Generate Music", variant="primary")
|
383 |
music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
|
384 |
|
385 |
generate_music_button.click(
|
|
|
388 |
outputs=[music_output],
|
389 |
)
|
390 |
|
391 |
+
# Step 4: Blend Audio
|
392 |
+
with gr.Tab("🎚️ Audio Blending"):
|
393 |
+
gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.")
|
394 |
ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
|
395 |
duck_level_slider = gr.Slider(
|
396 |
label="Ducking Level (dB attenuation)",
|
|
|
399 |
step=1,
|
400 |
value=10
|
401 |
)
|
402 |
+
blend_button = gr.Button("Blend Voice + Music", variant="primary")
|
403 |
blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
|
404 |
|
405 |
blend_button.click(
|
|
|
410 |
|
411 |
# Footer
|
412 |
gr.Markdown("""
|
413 |
+
<div class="footer">
|
414 |
+
<hr>
|
415 |
+
Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #88aaff;">bilsimaging.com</a>
|
416 |
+
<br>
|
417 |
+
<small>AI Promo Studio © 2025</small>
|
418 |
+
</div>
|
419 |
""")
|
420 |
|
421 |
# Visitor Badge
|
422 |
gr.HTML("""
|
423 |
+
<div style="text-align: center; margin-top: 1rem;">
|
424 |
+
<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
|
425 |
+
<img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759" alt="visitor badge"/>
|
426 |
+
</a>
|
427 |
+
</div>
|
428 |
""")
|
429 |
|
430 |
demo.launch(debug=True)
|