Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -10,8 +10,9 @@ from transformers import (
 )
 from scipy.io.wavfile import write
 from pydub import AudioSegment
-from
+from pydub.playback import play
 import tempfile
+from dotenv import load_dotenv
 import spaces

 # Load environment variables
@@ -19,10 +20,10 @@ load_dotenv()
 hf_token = os.getenv("HF_TOKEN")

 # ---------------------------------------------------------------------
-#
+# Script Generation Function
 # ---------------------------------------------------------------------
 @spaces.GPU(duration=300)
-def generate_script(user_prompt: str, model_id: str, token: str):
+def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
         model = AutoModelForCausalLM.from_pretrained(
@@ -35,21 +36,45 @@ def generate_script(user_prompt: str, model_id: str, token: str):
         llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

         system_prompt = (
-            "You are an expert radio imaging producer specializing in sound design and music. "
-            "
+            f"You are an expert radio imaging producer specializing in sound design and music. "
+            f"Based on the user's concept and the selected duration of {duration} seconds, craft a concise, engaging promo script. "
+            f"Ensure the script fits within the time limit and suggest a matching music style that complements the theme."
         )

-        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nRefined script:"
+        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nRefined script and music suggestion:"
         result = llama_pipeline(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
-
+
+        generated_text = result[0]["generated_text"].split("Refined script and music suggestion:")[-1].strip()
+        script, music_suggestion = generated_text.split("Music Suggestion:")
+        return script.strip(), music_suggestion.strip()
+    except Exception as e:
+        return f"Error generating script: {e}", None
+
+# ---------------------------------------------------------------------
+# Voice-Over Generation Function
+# ---------------------------------------------------------------------
+@spaces.GPU(duration=300)
+def generate_voice(script: str, speaker: str):
+    try:
+        # Replace with your chosen TTS model
+        tts_model = "coqui/XTTS-v2"
+        processor = AutoProcessor.from_pretrained(tts_model)
+        model = AutoModelForCausalLM.from_pretrained(tts_model)
+
+        inputs = processor(script, return_tensors="pt")
+        speech = model.generate(**inputs)
+
+        output_path = f"{tempfile.gettempdir()}/generated_voice.wav"
+        write(output_path, 22050, speech.cpu().numpy())
+        return output_path
     except Exception as e:
-        return f"Error generating script: {e}"
+        return f"Error generating voice-over: {e}"

 # ---------------------------------------------------------------------
-#
+# Music Generation Function
 # ---------------------------------------------------------------------
 @spaces.GPU(duration=300)
-def generate_audio(prompt: str, audio_length: int):
+def generate_music(prompt: str, audio_length: int):
     try:
         musicgen_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
         musicgen_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
@@ -62,116 +87,81 @@ def generate_audio(prompt: str, audio_length: int):

         audio_data = outputs[0, 0].cpu().numpy()
         normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
+
         output_path = f"{tempfile.gettempdir()}/generated_music.wav"
-        write(output_path,
+        write(output_path, 44100, normalized_audio)

         return output_path
     except Exception as e:
-        return f"Error generating
+        return f"Error generating music: {e}"

 # ---------------------------------------------------------------------
-#
+# Audio Blending Function with Ducking
 # ---------------------------------------------------------------------
-
-def generate_voice(script: str, language: str):
+def blend_audio(voice_path: str, music_path: str, ducking: bool):
     try:
-
-
-
-        voice_path = f"{tempfile.gettempdir()}/generated_voice.wav"
-        with open(voice_path, "wb") as f:
-            f.write(tts_output["audio"])
-
-        return voice_path
-    except Exception as e:
-        return f"Error generating voice-over: {e}"
-
-# ---------------------------------------------------------------------
-# Mix Audio with Ducking Option
-# ---------------------------------------------------------------------
-def mix_audio(voice_file, music_file, output_file, ducking: bool):
-    try:
-        voice = AudioSegment.from_file(voice_file)
-        music = AudioSegment.from_file(music_file)
+        voice = AudioSegment.from_file(voice_path)
+        music = AudioSegment.from_file(music_path)

         if ducking:
-            music = music - 10 # Lower
-        combined = music.overlay(voice, position=0)
+            music = music - 10 # Lower music volume for ducking

-        combined.
-
+        combined = music.overlay(voice)
+        output_path = f"{tempfile.gettempdir()}/final_promo.wav"
+        combined.export(output_path, format="wav")
+
+        return output_path
     except Exception as e:
-        return f"Error
+        return f"Error blending audio: {e}"

 # ---------------------------------------------------------------------
-# Gradio Interface
+# Gradio Interface
 # ---------------------------------------------------------------------
-def
-
+def process_all(user_prompt, llama_model_id, duration, audio_length, speaker, ducking):
+    script, music_suggestion = generate_script(user_prompt, llama_model_id, hf_token, duration)
+    if "Error" in script:
+        return script, None

-
-
+    voice_path = generate_voice(script, speaker)
+    if "Error" in voice_path:
+        return voice_path, None

-
-
+    music_path = generate_music(music_suggestion, audio_length)
+    if "Error" in music_path:
+        return music_path, None

-
-
-    return mix_audio(voice_file, music_file, output_file, ducking)
+    final_audio = blend_audio(voice_path, music_path, ducking)
+    return f"Script:\n{script}\n\nMusic Suggestion:\n{music_suggestion}", final_audio

-# ---------------------------------------------------------------------
-# Interface
-# ---------------------------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown(
-
-
-
-        🔥 **Zero GPU** integration powered by **Hugging Face** models.
-        """
-    )
-
-    # Step 1: Generate Script
-    gr.Markdown("## ✍️ Step 1: Generate Your Promo Script")
-    with gr.Row():
-        user_prompt = gr.Textbox(label="Enter Promo Idea", placeholder="E.g., A 15-second energetic jingle.", lines=2)
-        llama_model_id = gr.Textbox(label="Llama 3 Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct")
-        generate_script_button = gr.Button("Generate Script")
-        script_output = gr.Textbox(label="Generated Script", lines=4, interactive=False)
-
-    # Step 2: Generate Voice-Over
-    gr.Markdown("## 🎤 Step 2: Generate Voice-Over")
-    with gr.Row():
-        language = gr.Dropdown(label="Select Language", choices=["en", "es", "fr", "de"], value="en")
-        generate_voice_button = gr.Button("Generate Voice")
-        voice_output = gr.Audio(label="Generated Voice", type="filepath", interactive=False)
+    gr.Markdown("""
+    # 🎧 AI Promo Studio with Script, Voice, Music, and Mixing 🚀
+    Generate fully mixed promos effortlessly with AI-driven tools for radio and media!
+    """)

-    # Step 3: Generate Music
-    gr.Markdown("## 🎵 Step 3: Generate Background Music")
     with gr.Row():
-
-
-
+        user_prompt = gr.Textbox(label="Promo Idea", placeholder="E.g., A 30-second promo for a morning show.")
+        llama_model_id = gr.Textbox(label="Llama Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct")
+        duration = gr.Slider(label="Duration (seconds)", minimum=15, maximum=60, step=15, value=30)
+        audio_length = gr.Slider(label="Music Length (tokens)", minimum=128, maximum=1024, step=64, value=512)
+        speaker = gr.Textbox(label="Voice Style (optional)", placeholder="E.g., male, female, or neutral.")
+        ducking = gr.Checkbox(label="Enable Ducking", value=True)
+
+    generate_button = gr.Button("Generate Full Promo")
+    script_output = gr.Textbox(label="Generated Script and Music Suggestion")
+    audio_output = gr.Audio(label="Final Promo Audio", type="filepath")
+
+    generate_button.click(
+        fn=process_all,
+        inputs=[user_prompt, llama_model_id, duration, audio_length, speaker, ducking],
+        outputs=[script_output, audio_output],
+    )

-
-    gr.Markdown("## 🎶 Step 4: Mix Audio")
-    with gr.Row():
-        ducking = gr.Checkbox(label="Enable Ducking (lower background music volume)", value=True)
-        mix_audio_button = gr.Button("Mix Audio")
-        final_output = gr.Audio(label="Final Promo Audio", type="filepath", interactive=False)
-
-    # Button Actions
-    generate_script_button.click(interface_generate_script, inputs=[user_prompt, llama_model_id], outputs=script_output)
-    generate_voice_button.click(interface_generate_voice, inputs=[script_output, language], outputs=voice_output)
-    generate_audio_button.click(interface_generate_audio, inputs=[script_output, audio_length], outputs=audio_output)
-    mix_audio_button.click(interface_mix_audio, inputs=[voice_output, audio_output, ducking], outputs=final_output)
-
-    gr.Markdown(
-        """
+    gr.Markdown("""
     <hr>
-    <p style="text-align: center;
-
-
+    <p style="text-align: center; font-size: 0.9em;">
+        Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
+    </p>
+    """)

-# Launch App
 demo.launch(debug=True)
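
A note on the script-generation flow introduced above: the new generate_script builds one combined prompt, runs it through a transformers text-generation pipeline, and recovers the model's continuation by splitting on the trailing marker (the pipeline's generated_text contains the prompt followed by the completion). A condensed, self-contained sketch of that pattern; the meta-llama checkpoint is gated and needs an HF token, and the prompt text here is illustrative rather than the app's exact wording:

# Condensed sketch of the prompt-and-split pattern used in generate_script.
# Model choice and prompt text are illustrative, not the app's exact values.
from transformers import pipeline

generator = pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct")

prompt = (
    "You are an expert radio imaging producer specializing in sound design and music.\n"
    "User concept: a 30-second promo for a morning show\n"
    "Refined script and music suggestion:"
)
result = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
# generated_text echoes the prompt, so split on the marker to keep only the completion.
completion = result[0]["generated_text"].split("Refined script and music suggestion:")[-1].strip()
print(completion)

The new code then unpacks generated_text.split("Music Suggestion:") into two values, which relies on the model echoing that heading; if it does not, the resulting ValueError is caught by the surrounding except block.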
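
On the "Music Length (tokens)" slider: MusicGen's output length is set by max_new_tokens, and the model produces roughly 50 audio tokens per second, so the slider's 128–1024 range corresponds to about 2.5–20 seconds of music. A minimal sketch of the generation pattern, assuming only transformers and scipy; facebook/musicgen-small natively outputs 32 kHz audio, which can be read from the model config:

# Minimal MusicGen sketch; the prompt string is a placeholder.
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from scipy.io.wavfile import write

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["upbeat morning-show jingle"], padding=True, return_tensors="pt")
# ~50 audio tokens per generated second: 512 tokens is on the order of 10 seconds.
audio = model.generate(**inputs, max_new_tokens=512)

sampling_rate = model.config.audio_encoder.sampling_rate  # 32000 for musicgen-small
write("generated_music.wav", sampling_rate, audio[0, 0].cpu().numpy())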
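
The ducking option in blend_audio uses pydub's decibel arithmetic: subtracting 10 from an AudioSegment lowers its gain by 10 dB, and overlay() mixes the voice onto the quieter music bed starting at the beginning of the track. A small illustrative sketch with placeholder file names:

# Illustrative pydub ducking sketch; voice.wav and music.wav are placeholder paths.
from pydub import AudioSegment

voice = AudioSegment.from_file("voice.wav")
music = AudioSegment.from_file("music.wav")

ducked_music = music - 10            # reduce the music bed by 10 dB
mixed = ducked_music.overlay(voice)  # overlay the voice starting at 0 ms
mixed.export("final_promo.wav", format="wav")

Note that the overlaid result keeps the duration of the base segment (the music), so a voice-over longer than the music bed is truncated.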
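
Each generation function is wrapped in @spaces.GPU(duration=300); on a ZeroGPU Space this requests a GPU only while the decorated function runs (here for up to 300 seconds) and releases it afterwards. A minimal usage sketch, assuming the spaces package available on ZeroGPU hardware and a placeholder workload:

# Minimal ZeroGPU decorator sketch; the function body is a placeholder workload.
import spaces
import torch

@spaces.GPU(duration=120)  # GPU is attached only for the duration of this call
def double_on_gpu(values):
    tensor = torch.tensor(values, device="cuda")
    return (tensor * 2).cpu().tolist()

print(double_on_gpu([1.0, 2.0, 3.0]))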