Bils committed on
Commit
93b1697
·
verified ·
1 Parent(s): 2edecf4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -83
app.py CHANGED
@@ -39,6 +39,7 @@ def clean_text(text: str) -> str:
39
  """
40
  Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary.
41
  """
 
42
  return re.sub(r'\*', '', text)
43
 
44
  # ---------------------------------------------------------------------
@@ -50,6 +51,7 @@ def get_llama_pipeline(model_id: str, token: str):
50
  """
51
  if model_id in LLAMA_PIPELINES:
52
  return LLAMA_PIPELINES[model_id]
 
53
  tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
54
  model = AutoModelForCausalLM.from_pretrained(
55
  model_id,
@@ -62,6 +64,7 @@ def get_llama_pipeline(model_id: str, token: str):
62
  LLAMA_PIPELINES[model_id] = text_pipeline
63
  return text_pipeline
64
 
 
65
  def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
66
  """
67
  Returns a cached MusicGen model if available; otherwise, loads it.
@@ -69,23 +72,28 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
69
  """
70
  if model_key in MUSICGEN_MODELS:
71
  return MUSICGEN_MODELS[model_key]
 
72
  model = MusicgenForConditionalGeneration.from_pretrained(model_key)
73
  processor = AutoProcessor.from_pretrained(model_key)
 
74
  device = "cuda" if torch.cuda.is_available() else "cpu"
75
  model.to(device)
76
  MUSICGEN_MODELS[model_key] = (model, processor)
77
  return model, processor
78
 
 
79
  def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
80
  """
81
  Returns a cached TTS model if available; otherwise, loads it.
82
  """
83
  if model_name in TTS_MODELS:
84
  return TTS_MODELS[model_name]
 
85
  tts_model = TTS(model_name)
86
  TTS_MODELS[model_name] = tts_model
87
  return tts_model
88
 
 
89
  # ---------------------------------------------------------------------
90
  # Script Generation Function
91
  # ---------------------------------------------------------------------
@@ -93,10 +101,11 @@ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
93
  def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
94
  """
95
  Generates a script, sound design suggestions, and music ideas from a user prompt.
96
- Returns a tuple: (voice_script, sound_design, music_suggestions).
97
  """
98
  try:
99
  text_pipeline = get_llama_pipeline(model_id, token)
 
100
  system_prompt = (
101
  "You are an expert radio imaging producer specializing in sound design and music. "
102
  f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
@@ -118,10 +127,12 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
118
  if "Output:" in generated_text:
119
  generated_text = generated_text.split("Output:")[-1].strip()
120
 
 
121
  voice_script = "No voice-over script found."
122
  sound_design = "No sound design suggestions found."
123
  music_suggestions = "No music suggestions found."
124
 
 
125
  if "Voice-Over Script:" in generated_text:
126
  parts = generated_text.split("Voice-Over Script:")
127
  voice_script_part = parts[1]
@@ -130,6 +141,7 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
130
  else:
131
  voice_script = voice_script_part.strip()
132
 
 
133
  if "Sound Design Suggestions:" in generated_text:
134
  parts = generated_text.split("Sound Design Suggestions:")
135
  sound_design_part = parts[1]
@@ -138,6 +150,7 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
138
  else:
139
  sound_design = sound_design_part.strip()
140
 
 
141
  if "Music Suggestions:" in generated_text:
142
  parts = generated_text.split("Music Suggestions:")
143
  music_suggestions = parts[1].strip()
@@ -147,26 +160,34 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
147
  except Exception as e:
148
  return f"Error generating script: {e}", "", ""
149
 
 
150
  # ---------------------------------------------------------------------
151
  # Voice-Over Generation Function
152
  # ---------------------------------------------------------------------
153
  @spaces.GPU(duration=100)
154
  def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
155
  """
156
- Generates a voice-over from the provided script using Coqui TTS.
157
  Returns the file path to the generated .wav file.
158
  """
159
  try:
160
  if not script.strip():
161
  return "Error: No script provided."
 
 
162
  cleaned_script = clean_text(script)
 
163
  tts_model = get_tts_model(tts_model_name)
 
 
164
  output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
165
  tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
166
  return output_path
 
167
  except Exception as e:
168
  return f"Error generating voice: {e}"
169
 
 
170
  # ---------------------------------------------------------------------
171
  # Music Generation Function
172
  # ---------------------------------------------------------------------
@@ -184,28 +205,33 @@ def generate_music(prompt: str, audio_length: int):
184
  musicgen_model, musicgen_processor = get_musicgen_model(model_key)
185
 
186
  device = "cuda" if torch.cuda.is_available() else "cpu"
187
- inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt")
188
- inputs = {k: v.to(device) for k, v in inputs.items()}
189
 
190
  with torch.inference_mode():
191
  outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
192
 
193
  audio_data = outputs[0, 0].cpu().numpy()
194
  normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
 
195
  output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
196
  write(output_path, 44100, normalized_audio)
 
197
  return output_path
198
 
199
  except Exception as e:
200
  return f"Error generating music: {e}"
201
 
 
202
  # ---------------------------------------------------------------------
203
- # Audio Blending Function
204
  # ---------------------------------------------------------------------
205
  @spaces.GPU(duration=100)
206
  def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
207
  """
208
  Blends two audio files (voice and music).
 
 
 
209
  Returns the file path to the blended .wav file.
210
  """
211
  try:
@@ -214,19 +240,27 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
214
 
215
  voice = AudioSegment.from_wav(voice_path)
216
  music = AudioSegment.from_wav(music_path)
217
- voice_len = len(voice)
218
- music_len = len(music)
219
 
 
 
 
 
220
  if music_len < voice_len:
221
  looped_music = AudioSegment.empty()
222
  while len(looped_music) < voice_len:
223
  looped_music += music
224
  music = looped_music
225
 
 
226
  if len(music) > voice_len:
227
  music = music[:voice_len]
228
 
229
- final_audio = music.overlay(voice, gain_during_overlay=-duck_level) if ducking else music.overlay(voice)
 
 
 
 
 
230
  output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
231
  final_audio.export(output_path, format="wav")
232
  return output_path
@@ -234,29 +268,12 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
234
  except Exception as e:
235
  return f"Error blending audio: {e}"
236
 
237
- # ---------------------------------------------------------------------
238
- # Agent Function: Orchestrate the Full Workflow
239
- # ---------------------------------------------------------------------
240
- @spaces.GPU(duration=400)
241
- def run_agent(user_prompt: str, llama_model_id: str, duration: int, tts_model_name: str, music_length: int, ducking: bool, duck_level: int):
242
- """
243
- Runs the full workflow as an agent:
244
- 1. Generates a script (voice-over, sound design, and music suggestions).
245
- 2. Synthesizes a voice-over.
246
- 3. Generates a music track.
247
- 4. Blends the voice and music.
248
- Returns all generated components.
249
- """
250
- voice_script, sound_design, music_suggestions = generate_script(user_prompt, llama_model_id, HF_TOKEN, duration)
251
- voice_file = generate_voice(voice_script, tts_model_name)
252
- music_file = generate_music(music_suggestions, music_length)
253
- blended_file = blend_audio(voice_file, music_file, ducking, duck_level)
254
- return voice_script, sound_design, music_suggestions, voice_file, music_file, blended_file
255
 
256
  # ---------------------------------------------------------------------
257
  # Gradio Interface with Enhanced UI
258
  # ---------------------------------------------------------------------
259
  with gr.Blocks(css="""
 
260
  body {
261
  background: linear-gradient(135deg, #1d1f21, #3a3d41);
262
  color: #f0f0f0;
@@ -303,88 +320,109 @@ with gr.Blocks(css="""
303
 
304
  gr.Markdown("""
305
  Welcome to **AI Promo Studio**! This platform leverages state-of-the-art AI models to help you generate:
306
- - A compelling voice-over script (with sound design and music suggestions),
307
- - A natural-sounding voice-over,
308
- - Custom music tracks,
309
- - And a fully blended audio promo.
 
310
  """)
311
 
312
  with gr.Tabs():
313
- # Tab 1: Script Generation
314
  with gr.Tab("📝 Script Generation"):
315
  with gr.Row():
316
- user_prompt = gr.Textbox(label="Promo Idea", placeholder="E.g., A 30-second promo for a morning show...", lines=2)
 
 
 
 
317
  with gr.Row():
318
- llama_model_id = gr.Textbox(label="LLaMA Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct", placeholder="Enter a valid Hugging Face model ID")
319
- duration = gr.Slider(label="Promo Duration (seconds)", minimum=15, maximum=60, step=15, value=30)
 
 
 
 
 
 
 
 
 
 
320
  generate_script_button = gr.Button("Generate Script", variant="primary")
321
- script_output = gr.Textbox(label="Voice-Over Script", lines=5, interactive=False)
322
  sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
323
  music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
324
- generate_script_button.click(fn=lambda prompt, model, dur: generate_script(prompt, model, HF_TOKEN, dur),
325
- inputs=[user_prompt, llama_model_id, duration],
326
- outputs=[script_output, sound_design_output, music_suggestion_output])
327
 
328
- # Tab 2: Voice Synthesis
 
 
 
 
 
 
329
  with gr.Tab("🎤 Voice Synthesis"):
330
  gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
331
- selected_tts_model = gr.Dropdown(label="TTS Model",
332
- choices=["tts_models/en/ljspeech/tacotron2-DDC", "tts_models/en/ljspeech/vits", "tts_models/en/sam/tacotron-DDC"],
333
- value="tts_models/en/ljspeech/tacotron2-DDC", multiselect=False)
 
 
 
 
 
 
 
334
  generate_voice_button = gr.Button("Generate Voice-Over", variant="primary")
335
  voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")
336
- generate_voice_button.click(fn=lambda script, tts: generate_voice(script, tts),
337
- inputs=[script_output, selected_tts_model],
338
- outputs=voice_audio_output)
339
 
340
- # Tab 3: Music Production
 
 
 
 
 
 
341
  with gr.Tab("🎶 Music Production"):
342
- gr.Markdown("Generate a custom music track using the MusicGen Large model.")
343
- audio_length = gr.Slider(label="Music Length (tokens)", minimum=128, maximum=1024, step=64, value=512, info="Increase tokens for longer audio (inference time may vary).")
 
 
 
 
 
 
 
344
  generate_music_button = gr.Button("Generate Music", variant="primary")
345
  music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
346
- generate_music_button.click(fn=lambda sugg, length: generate_music(sugg, length),
347
- inputs=[music_suggestion_output, audio_length],
348
- outputs=[music_output])
349
 
350
- # Tab 4: Audio Blending
 
 
 
 
 
 
351
  with gr.Tab("🎚️ Audio Blending"):
352
- gr.Markdown("Blend your voice-over and music track. Enable ducking to lower the music during voice segments.")
353
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
354
- duck_level_slider = gr.Slider(label="Ducking Level (dB attenuation)", minimum=0, maximum=20, step=1, value=10)
 
 
 
 
 
 
355
  blend_button = gr.Button("Blend Voice + Music", variant="primary")
356
  blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
357
- blend_button.click(fn=blend_audio,
358
- inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
359
- outputs=blended_output)
360
 
361
- # Tab 5: Agent – Full Workflow
362
- with gr.Tab("🤖 Agent"):
363
- gr.Markdown("Let the agent handle everything in one go: generate script, synthesize voice, produce music, and blend the final ad.")
364
- with gr.Row():
365
- agent_prompt = gr.Textbox(label="Ad Promo Idea", placeholder="Enter your ad promo concept...", lines=2)
366
- with gr.Row():
367
- agent_llama_model_id = gr.Textbox(label="LLaMA Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct", placeholder="Enter a valid Hugging Face model ID")
368
- agent_duration = gr.Slider(label="Promo Duration (seconds)", minimum=15, maximum=60, step=15, value=30)
369
- with gr.Row():
370
- agent_tts_model = gr.Dropdown(label="TTS Model",
371
- choices=["tts_models/en/ljspeech/tacotron2-DDC", "tts_models/en/ljspeech/vits", "tts_models/en/sam/tacotron-DDC"],
372
- value="tts_models/en/ljspeech/tacotron2-DDC", multiselect=False)
373
- agent_music_length = gr.Slider(label="Music Length (tokens)", minimum=128, maximum=1024, step=64, value=512)
374
- with gr.Row():
375
- agent_ducking = gr.Checkbox(label="Enable Ducking?", value=True)
376
- agent_duck_level = gr.Slider(label="Ducking Level (dB attenuation)", minimum=0, maximum=20, step=1, value=10)
377
- agent_run_button = gr.Button("Run Agent", variant="primary")
378
- agent_script_output = gr.Textbox(label="Voice-Over Script", lines=5, interactive=False)
379
- agent_sound_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
380
- agent_music_suggestions_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
381
- agent_voice_audio = gr.Audio(label="Voice-Over (WAV)", type="filepath")
382
- agent_music_audio = gr.Audio(label="Generated Music (WAV)", type="filepath")
383
- agent_blended_audio = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
384
- agent_run_button.click(fn=run_agent,
385
- inputs=[agent_prompt, agent_llama_model_id, agent_duration, agent_tts_model, agent_music_length, agent_ducking, agent_duck_level],
386
- outputs=[agent_script_output, agent_sound_output, agent_music_suggestions_output, agent_voice_audio, agent_music_audio, agent_blended_audio])
387
 
 
388
  gr.Markdown("""
389
  <div class="footer">
390
  <hr>
@@ -394,6 +432,7 @@ with gr.Blocks(css="""
394
  </div>
395
  """)
396
 
 
397
  gr.HTML("""
398
  <div style="text-align: center; margin-top: 1rem;">
399
  <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
 
39
  """
40
  Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary.
41
  """
42
+ # Remove all asterisks. You can add more cleaning steps here as needed.
43
  return re.sub(r'\*', '', text)
44
 
45
  # ---------------------------------------------------------------------
 
51
  """
52
  if model_id in LLAMA_PIPELINES:
53
  return LLAMA_PIPELINES[model_id]
54
+
55
  tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
56
  model = AutoModelForCausalLM.from_pretrained(
57
  model_id,
 
64
  LLAMA_PIPELINES[model_id] = text_pipeline
65
  return text_pipeline
66
 
67
+
68
  def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
69
  """
70
  Returns a cached MusicGen model if available; otherwise, loads it.
 
72
  """
73
  if model_key in MUSICGEN_MODELS:
74
  return MUSICGEN_MODELS[model_key]
75
+
76
  model = MusicgenForConditionalGeneration.from_pretrained(model_key)
77
  processor = AutoProcessor.from_pretrained(model_key)
78
+
79
  device = "cuda" if torch.cuda.is_available() else "cpu"
80
  model.to(device)
81
  MUSICGEN_MODELS[model_key] = (model, processor)
82
  return model, processor
83
 
84
+
85
  def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
86
  """
87
  Returns a cached TTS model if available; otherwise, loads it.
88
  """
89
  if model_name in TTS_MODELS:
90
  return TTS_MODELS[model_name]
91
+
92
  tts_model = TTS(model_name)
93
  TTS_MODELS[model_name] = tts_model
94
  return tts_model
95
 
96
+
97
  # ---------------------------------------------------------------------
98
  # Script Generation Function
99
  # ---------------------------------------------------------------------
 
101
  def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
102
  """
103
  Generates a script, sound design suggestions, and music ideas from a user prompt.
104
+ Returns a tuple of strings: (voice_script, sound_design, music_suggestions).
105
  """
106
  try:
107
  text_pipeline = get_llama_pipeline(model_id, token)
108
+
109
  system_prompt = (
110
  "You are an expert radio imaging producer specializing in sound design and music. "
111
  f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
 
127
  if "Output:" in generated_text:
128
  generated_text = generated_text.split("Output:")[-1].strip()
129
 
130
+ # Default placeholders
131
  voice_script = "No voice-over script found."
132
  sound_design = "No sound design suggestions found."
133
  music_suggestions = "No music suggestions found."
134
 
135
+ # Voice-Over Script
136
  if "Voice-Over Script:" in generated_text:
137
  parts = generated_text.split("Voice-Over Script:")
138
  voice_script_part = parts[1]
 
141
  else:
142
  voice_script = voice_script_part.strip()
143
 
144
+ # Sound Design
145
  if "Sound Design Suggestions:" in generated_text:
146
  parts = generated_text.split("Sound Design Suggestions:")
147
  sound_design_part = parts[1]
 
150
  else:
151
  sound_design = sound_design_part.strip()
152
 
153
+ # Music Suggestions
154
  if "Music Suggestions:" in generated_text:
155
  parts = generated_text.split("Music Suggestions:")
156
  music_suggestions = parts[1].strip()
 
160
  except Exception as e:
161
  return f"Error generating script: {e}", "", ""
162
 
163
+
164
  # ---------------------------------------------------------------------
165
  # Voice-Over Generation Function
166
  # ---------------------------------------------------------------------
167
  @spaces.GPU(duration=100)
168
  def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
169
  """
170
+ Generates a voice-over from the provided script using the Coqui TTS model.
171
  Returns the file path to the generated .wav file.
172
  """
173
  try:
174
  if not script.strip():
175
  return "Error: No script provided."
176
+
177
+ # Clean the script to remove special characters (e.g., asterisks) that may produce warnings
178
  cleaned_script = clean_text(script)
179
+
180
  tts_model = get_tts_model(tts_model_name)
181
+
182
+ # Generate and save voice
183
  output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
184
  tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
185
  return output_path
186
+
187
  except Exception as e:
188
  return f"Error generating voice: {e}"
189
 
190
+
191
  # ---------------------------------------------------------------------
192
  # Music Generation Function
193
  # ---------------------------------------------------------------------
 
205
  musicgen_model, musicgen_processor = get_musicgen_model(model_key)
206
 
207
  device = "cuda" if torch.cuda.is_available() else "cpu"
208
+ inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
 
209
 
210
  with torch.inference_mode():
211
  outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
212
 
213
  audio_data = outputs[0, 0].cpu().numpy()
214
  normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
215
+
216
  output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
217
  write(output_path, 44100, normalized_audio)
218
+
219
  return output_path
220
 
221
  except Exception as e:
222
  return f"Error generating music: {e}"
223
 
224
+
225
  # ---------------------------------------------------------------------
226
+ # Audio Blending with Duration Sync & Ducking
227
  # ---------------------------------------------------------------------
228
  @spaces.GPU(duration=100)
229
  def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
230
  """
231
  Blends two audio files (voice and music).
232
+ 1. If music < voice, loops the music until it meets/exceeds the voice duration.
233
+ 2. If music > voice, trims music to the voice duration.
234
+ 3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice is playing.
235
  Returns the file path to the blended .wav file.
236
  """
237
  try:
 
240
 
241
  voice = AudioSegment.from_wav(voice_path)
242
  music = AudioSegment.from_wav(music_path)
 
 
243
 
244
+ voice_len = len(voice) # in milliseconds
245
+ music_len = len(music) # in milliseconds
246
+
247
+ # Loop music if it's shorter than the voice
248
  if music_len < voice_len:
249
  looped_music = AudioSegment.empty()
250
  while len(looped_music) < voice_len:
251
  looped_music += music
252
  music = looped_music
253
 
254
+ # Trim music if it's longer than the voice
255
  if len(music) > voice_len:
256
  music = music[:voice_len]
257
 
258
+ if ducking:
259
+ ducked_music = music - duck_level
260
+ final_audio = ducked_music.overlay(voice)
261
+ else:
262
+ final_audio = music.overlay(voice)
263
+
264
  output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
265
  final_audio.export(output_path, format="wav")
266
  return output_path
 
268
  except Exception as e:
269
  return f"Error blending audio: {e}"
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  # ---------------------------------------------------------------------
273
  # Gradio Interface with Enhanced UI
274
  # ---------------------------------------------------------------------
275
  with gr.Blocks(css="""
276
+ /* Global Styles */
277
  body {
278
  background: linear-gradient(135deg, #1d1f21, #3a3d41);
279
  color: #f0f0f0;
 
320
 
321
  gr.Markdown("""
322
  Welcome to **AI Promo Studio**! This platform leverages state-of-the-art AI models to help you generate:
323
+
324
+ - **Script**: Generate a compelling voice-over script with LLaMA.
325
+ - **Voice Synthesis**: Create natural-sounding voice-overs using Coqui TTS.
326
+ - **Music Production**: Produce custom music tracks with MusicGen.
327
+ - **Audio Blending**: Seamlessly blend voice and music with options for ducking.
328
  """)
329
 
330
  with gr.Tabs():
331
+ # Step 1: Generate Script
332
  with gr.Tab("📝 Script Generation"):
333
  with gr.Row():
334
+ user_prompt = gr.Textbox(
335
+ label="Promo Idea",
336
+ placeholder="E.g., A 30-second promo for a morning show...",
337
+ lines=2
338
+ )
339
  with gr.Row():
340
+ llama_model_id = gr.Textbox(
341
+ label="LLaMA Model ID",
342
+ value="meta-llama/Meta-Llama-3-8B-Instruct",
343
+ placeholder="Enter a valid Hugging Face model ID"
344
+ )
345
+ duration = gr.Slider(
346
+ label="Desired Promo Duration (seconds)",
347
+ minimum=15,
348
+ maximum=60,
349
+ step=15,
350
+ value=30
351
+ )
352
  generate_script_button = gr.Button("Generate Script", variant="primary")
353
+ script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False)
354
  sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
355
  music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
 
 
 
356
 
357
+ generate_script_button.click(
358
+ fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur),
359
+ inputs=[user_prompt, llama_model_id, duration],
360
+ outputs=[script_output, sound_design_output, music_suggestion_output],
361
+ )
362
+
363
+ # Step 2: Generate Voice
364
  with gr.Tab("🎤 Voice Synthesis"):
365
  gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
366
+ selected_tts_model = gr.Dropdown(
367
+ label="TTS Model",
368
+ choices=[
369
+ "tts_models/en/ljspeech/tacotron2-DDC",
370
+ "tts_models/en/ljspeech/vits",
371
+ "tts_models/en/sam/tacotron-DDC",
372
+ ],
373
+ value="tts_models/en/ljspeech/tacotron2-DDC",
374
+ multiselect=False
375
+ )
376
  generate_voice_button = gr.Button("Generate Voice-Over", variant="primary")
377
  voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")
 
 
 
378
 
379
+ generate_voice_button.click(
380
+ fn=lambda script, tts_model: generate_voice(script, tts_model),
381
+ inputs=[script_output, selected_tts_model],
382
+ outputs=voice_audio_output,
383
+ )
384
+
385
+ # Step 3: Generate Music
386
  with gr.Tab("🎶 Music Production"):
387
+ gr.Markdown("Generate a custom music track using the **MusicGen Large** model.")
388
+ audio_length = gr.Slider(
389
+ label="Music Length (tokens)",
390
+ minimum=128,
391
+ maximum=1024,
392
+ step=64,
393
+ value=512,
394
+ info="Increase tokens for longer audio (inference time may vary)."
395
+ )
396
  generate_music_button = gr.Button("Generate Music", variant="primary")
397
  music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
 
 
 
398
 
399
+ generate_music_button.click(
400
+ fn=lambda music_suggestion, length: generate_music(music_suggestion, length),
401
+ inputs=[music_suggestion_output, audio_length],
402
+ outputs=[music_output],
403
+ )
404
+
405
+ # Step 4: Blend Audio
406
  with gr.Tab("🎚️ Audio Blending"):
407
+ gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.")
408
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
409
+ duck_level_slider = gr.Slider(
410
+ label="Ducking Level (dB attenuation)",
411
+ minimum=0,
412
+ maximum=20,
413
+ step=1,
414
+ value=10
415
+ )
416
  blend_button = gr.Button("Blend Voice + Music", variant="primary")
417
  blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
 
 
 
418
 
419
+ blend_button.click(
420
+ fn=blend_audio,
421
+ inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
422
+ outputs=blended_output
423
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
+ # Footer
426
  gr.Markdown("""
427
  <div class="footer">
428
  <hr>
 
432
  </div>
433
  """)
434
 
435
+ # Visitor Badge
436
  gr.HTML("""
437
  <div style="text-align: center; margin-top: 1rem;">
438
  <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">