Bils committed on
Commit 3e34a93 · verified · 1 Parent(s): 622c89a

Update app.py

Files changed (1)
app.py +334 -266
app.py CHANGED
@@ -1,8 +1,6 @@
  import gradio as gr
  import os
  import torch
- import numpy as np
- import matplotlib.pyplot as plt
  from transformers import (
      AutoTokenizer,
      AutoModelForCausalLM,
@@ -15,308 +13,378 @@ from pydub import AudioSegment
  from dotenv import load_dotenv
  import tempfile
  import spaces
  from TTS.api import TTS
- import psutil
- import GPUtil

- # -------------------------------
- # Configuration
- # -------------------------------
  load_dotenv()
- HF_TOKEN = os.getenv("HF_TOKEN", os.getenv("HF_TOKEN_SECRET"))
-
- MODEL_CONFIG = {
-     "llama_models": {
-         "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
-         "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
-     },
-     "tts_models": {
-         "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
-         "High Quality": "tts_models/en/ljspeech/vits"
-     },
-     "musicgen_model": "facebook/musicgen-medium"
- }
-
- # -------------------------------
- # Model Manager with Cache
- # -------------------------------
-
- class ModelManager:
-     def __init__(self):
-         self.llama_pipelines = {}
-         self.musicgen_model = None
-         self.tts_models = {}
-         self.processor = None  # Add processor cache
-
-     def get_llama_pipeline(self, model_id, token):
-         if model_id not in self.llama_pipelines:
-             tokenizer = AutoTokenizer.from_pretrained(
-                 model_id,
-                 token=token,
-                 legacy=False
-             )
-             model = AutoModelForCausalLM.from_pretrained(
-                 model_id,
-                 token=token,
-                 torch_dtype=torch.float16,
-                 device_map="auto",
-                 low_cpu_mem_usage=True
-             )
-             self.llama_pipelines[model_id] = pipeline(
-                 "text-generation",
-                 model=model,
-                 tokenizer=tokenizer,
-                 device_map="auto"
-             )
-         return self.llama_pipelines[model_id]

-     def get_musicgen_model(self):
-         if not self.musicgen_model:
-             self.musicgen_model = MusicgenForConditionalGeneration.from_pretrained(
-                 MODEL_CONFIG["musicgen_model"]
-             )
-             self.processor = AutoProcessor.from_pretrained(MODEL_CONFIG["musicgen_model"])
-             self.musicgen_model.to("cuda" if torch.cuda.is_available() else "cpu")
-         return self.musicgen_model, self.processor

- model_manager = ModelManager()

- # -------------------------------
- # Core Functions with Enhanced Error Handling
- # -------------------------------
- @spaces.GPU
- def generate_script(user_prompt, model_id, duration, progress=gr.Progress()):
      try:
-         progress(0.1, "Initializing script generation...")
-         text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
-
-         system_prompt = f"""Generate a {duration}-second radio promo with:
- 1. Voice Script: [Clear narration, 25-35 words]
- 2. Sound Design: [3-5 specific sound effects]
- 3. Music: [Genre, tempo, mood]
-
- Format strictly as:
- Voice Script: [content]
- Sound Design: [effects]
- Music: [description]"""
-
-         progress(0.3, "Generating content...")
-         response = text_pipeline(
-             f"{system_prompt}\nConcept: {user_prompt}",
-             max_new_tokens=300,
-             temperature=0.7,
-             do_sample=True,
-             top_p=0.95
          )

-         progress(0.8, "Parsing results...")
-         return parse_generated_content(response[0]["generated_text"])
      except Exception as e:
-         return [f"Error: {str(e)}"] * 3

- def parse_generated_content(text):
-     sections = {"Voice Script": "", "Sound Design": "", "Music": ""}
-     current_section = None
-
-     for line in text.split('\n'):
-         line = line.strip()
-         for section in sections:
-             if line.startswith(section + ":"):
-                 current_section = section
-                 line = line.replace(section + ":", "").strip()
-                 break
-         if current_section and line:
-             sections[current_section] += line + "\n"
-
-     return [sections[section].strip() for section in sections]

- @spaces.GPU
- def generate_voice(script, tts_model, speed=1.0, progress=gr.Progress()):
      try:
-         progress(0.2, "Initializing TTS...")
          if not script.strip():
-             return None, "No script provided"
-
-         tts = model_manager.get_tts_model(tts_model)
-         output_path = os.path.join(tempfile.gettempdir(), "voice.wav")
-
-         progress(0.5, "Generating audio...")
-         tts.tts_to_file(text=script, file_path=output_path, speed=speed)
-
-         return output_path, None
      except Exception as e:
-         return None, f"Voice Error: {str(e)}"

- @spaces.GPU
- def generate_music(prompt, duration_sec=30, progress=gr.Progress()):
      try:
-         progress(0.1, "Initializing MusicGen...")
-         model = model_manager.get_musicgen_model()
-         processor = AutoProcessor.from_pretrained(MODEL_CONFIG["musicgen_model"])
-
-         progress(0.4, "Processing input...")
-         inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(model.device)
-
-         progress(0.6, "Generating music...")
-         audio_values = model.generate(**inputs, max_new_tokens=int(duration_sec * 50))
-
-         output_path = os.path.join(tempfile.gettempdir(), "music.wav")
-         write(output_path, 32000, audio_values[0, 0].cpu().numpy())
-         return output_path, None
      except Exception as e:
-         return None, f"Music Error: {str(e)}"

- def blend_audio(voice_path, music_path, ducking=True, progress=gr.Progress()):
      try:
-         progress(0.2, "Loading audio files...")
          voice = AudioSegment.from_wav(voice_path)
          music = AudioSegment.from_wav(music_path)

-         progress(0.4, "Aligning durations...")
-         if len(music) < len(voice):
-             music = music * (len(voice) // len(music) + 1)
-         music = music[:len(voice)]

-         progress(0.6, "Mixing audio...")
          if ducking:
-             music = music - 10  # 10dB ducking

-         mixed = music.overlay(voice)
-         output_path = os.path.join(tempfile.gettempdir(), "final_mix.wav")
-         mixed.export(output_path, format="wav")
-         return output_path, None
      except Exception as e:
-         return None, f"Mixing Error: {str(e)}"
-
- # -------------------------------
- # UI Components
- # -------------------------------
- def create_audio_visualization(audio_path):
-     if not audio_path:
-         return None
-     audio = AudioSegment.from_file(audio_path)
-     samples = np.array(audio.get_array_of_samples())
-
-     plt.figure(figsize=(10, 3))
-     plt.plot(samples)
-     plt.axis('off')
-     plt.tight_layout()
-
-     temp_file = os.path.join(tempfile.gettempdir(), "waveform.png")
-     plt.savefig(temp_file, bbox_inches='tight', pad_inches=0)
-     plt.close()
-     return temp_file
-
- def system_monitor():
-     gpus = GPUtil.getGPUs()
-     return {
-         "CPU": f"{psutil.cpu_percent()}%",
-         "RAM": f"{psutil.virtual_memory().percent}%",
-         "GPU": f"{gpus[0].load*100 if gpus else 0:.1f}%" if gpus else "N/A"
-     }
-
- # -------------------------------
  # Gradio Interface
- # -------------------------------
- theme = gr.themes.Soft(
-     primary_hue="blue",
-     secondary_hue="teal",
- ).set(
-     body_text_color_dark='#FFFFFF',
-     background_fill_primary_dark='#1F1F1F'
- )

- with gr.Blocks(theme=theme, title="AI Radio Studio Pro") as demo:
-     gr.Markdown("# 🎙️ AI Radio Studio Pro")
-
-     with gr.Row():
-         with gr.Column(scale=3):
-             concept_input = gr.Textbox(
-                 label="Concept Description",
-                 placeholder="Describe your radio segment...",
-                 lines=3
-             )
-             with gr.Accordion("Advanced Settings", open=False):
-                 model_selector = gr.Dropdown(
-                     list(MODEL_CONFIG["llama_models"].values()),
-                     label="AI Model",
-                     value=next(iter(MODEL_CONFIG["llama_models"].values()))
-                 )
-                 duration_selector = gr.Slider(15, 120, 30, step=15, label="Duration (seconds)")
-
-             generate_btn = gr.Button("Generate Script", variant="primary")
-
-         with gr.Column(scale=2):
-             script_output = gr.Textbox(label="Voice Script", interactive=True)
-             sound_output = gr.Textbox(label="Sound Design", interactive=True)
-             music_output = gr.Textbox(label="Music Style", interactive=True)

      with gr.Tabs():
-         with gr.Tab("🎤 Voice Production"):
              with gr.Row():
-                 tts_selector = gr.Dropdown(
-                     list(MODEL_CONFIG["tts_models"].values()),
-                     label="Voice Model",
-                     value=next(iter(MODEL_CONFIG["tts_models"].values()))
                  )
-                 speed_selector = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speaking Rate")
-                 voice_btn = gr.Button("Generate Voiceover", variant="primary")
-             with gr.Row():
-                 voice_audio = gr.Audio(label="Voice Preview", interactive=False)
-                 voice_viz = gr.Image(label="Waveform", interactive=False)

-         with gr.Tab("🎵 Music Production"):
-             music_btn = gr.Button("Generate Music Track", variant="primary")
-             with gr.Row():
-                 music_audio = gr.Audio(label="Music Preview", interactive=False)
-                 music_viz = gr.Image(label="Waveform", interactive=False)

-         with gr.Tab("🔉 Final Mix"):
-             mix_btn = gr.Button("Create Final Mix", variant="primary")
-             with gr.Row():
-                 final_mix_audio = gr.Audio(label="Final Mix", interactive=False)
-                 final_mix_viz = gr.Image(label="Waveform", interactive=False)
-             with gr.Row():
-                 download_btn = gr.Button("Download Mix")
-                 play_btn = gr.Button("▶️ Play in Browser")

-     with gr.Accordion("📊 System Monitor", open=False):
-         monitor = gr.JSON(label="Resource Usage", value=lambda: system_monitor(), every=5)

      gr.Markdown("""
-     <div style="text-align: center; padding: 20px; border-top: 1px solid #444;">
-         <p>Created with ❤️ by <a href="https://bilsimaging.com">Bils Imaging</a></p>
-         <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/radiogold&countColor=%23263759">
-     </div>
      """)
-
-     # Event Handling
-     generate_btn.click(
-         generate_script,
-         [concept_input, model_selector, duration_selector],
-         [script_output, sound_output, music_output]
-     )
-
-     voice_btn.click(
-         generate_voice,
-         [script_output, tts_selector, speed_selector],
-         [voice_audio, voice_viz],
-         preprocess=create_audio_visualization
-     )

-     music_btn.click(
-         generate_music,
-         [music_output],
-         [music_audio, music_viz],
-         preprocess=create_audio_visualization
-     )
-
-     mix_btn.click(
-         blend_audio,
-         [voice_audio, music_audio],
-         [final_mix_audio, final_mix_viz],
-         preprocess=create_audio_visualization
-     )

- if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)
  import gradio as gr
  import os
  import torch
  from transformers import (
      AutoTokenizer,
      AutoModelForCausalLM,

  from dotenv import load_dotenv
  import tempfile
  import spaces
+
+ # Coqui TTS
  from TTS.api import TTS

+ # ---------------------------------------------------------------------
+ # Load Environment Variables
+ # ---------------------------------------------------------------------
  load_dotenv()
+ HF_TOKEN = os.getenv("HF_TOKEN")

+ # ---------------------------------------------------------------------
+ # Global Model Caches
+ # ---------------------------------------------------------------------
+ LLAMA_PIPELINES = {}
+ MUSICGEN_MODELS = {}
+ TTS_MODELS = {}
+
+ # ---------------------------------------------------------------------
+ # Helper Functions
+ # ---------------------------------------------------------------------
+ def get_llama_pipeline(model_id: str, token: str):
+     """
+     Returns a cached LLaMA pipeline if available; otherwise, loads it.
+     """
+     if model_id in LLAMA_PIPELINES:
+         return LLAMA_PIPELINES[model_id]
+
+     tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         use_auth_token=token,
+         torch_dtype=torch.float16,
+         device_map="auto",
+         trust_remote_code=True,
+     )
+     text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+     LLAMA_PIPELINES[model_id] = text_pipeline
+     return text_pipeline
+
+
+ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
+     """
+     Returns a cached MusicGen model if available; otherwise, loads it.
+     Uses the 'large' variant for higher quality outputs.
+     """
+     if model_key in MUSICGEN_MODELS:
+         return MUSICGEN_MODELS[model_key]
+
+     model = MusicgenForConditionalGeneration.from_pretrained(model_key)
+     processor = AutoProcessor.from_pretrained(model_key)
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model.to(device)
+     MUSICGEN_MODELS[model_key] = (model, processor)
+     return model, processor
+
+
+ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
+     """
+     Returns a cached TTS model if available; otherwise, loads it.
+     """
+     if model_name in TTS_MODELS:
+         return TTS_MODELS[model_name]
+
+     tts_model = TTS(model_name)
+     TTS_MODELS[model_name] = tts_model
+     return tts_model
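Note on the three helpers above: they share a simple module-level cache, so the first call pays the load cost and later calls return the same object. A minimal sketch of the expected behavior (hypothetical session in which these helpers are already in scope; the model name is one of the choices offered in the UI below):

    tts_a = get_tts_model("tts_models/en/ljspeech/tacotron2-DDC")  # first call loads the model
    tts_b = get_tts_model("tts_models/en/ljspeech/tacotron2-DDC")  # second call hits TTS_MODELS
    assert tts_a is tts_b  # the cached instance is reused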
+
+
+ # ---------------------------------------------------------------------
+ # Script Generation Function
+ # ---------------------------------------------------------------------
+ @spaces.GPU(duration=100)
+ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
+     """
+     Generates a script, sound design suggestions, and music ideas from a user prompt.
+     Returns a tuple of strings: (voice_script, sound_design, music_suggestions).
+     """
      try:
+         text_pipeline = get_llama_pipeline(model_id, token)
+
+         system_prompt = (
+             "You are an expert radio imaging producer specializing in sound design and music. "
+             f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
+             "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
+             "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
+             "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
          )
+         combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
+
+         with torch.inference_mode():
+             result = text_pipeline(
+                 combined_prompt,
+                 max_new_tokens=300,
+                 do_sample=True,
+                 temperature=0.8
+             )
+
+         generated_text = result[0]["generated_text"]
+         if "Output:" in generated_text:
+             generated_text = generated_text.split("Output:")[-1].strip()
+
+         # Default placeholders
+         voice_script = "No voice-over script found."
+         sound_design = "No sound design suggestions found."
+         music_suggestions = "No music suggestions found."
+
+         # Voice-Over Script
+         if "Voice-Over Script:" in generated_text:
+             parts = generated_text.split("Voice-Over Script:")
+             voice_script_part = parts[1]
+             if "Sound Design Suggestions:" in voice_script_part:
+                 voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip()
+             else:
+                 voice_script = voice_script_part.strip()
+
+         # Sound Design
+         if "Sound Design Suggestions:" in generated_text:
+             parts = generated_text.split("Sound Design Suggestions:")
+             sound_design_part = parts[1]
+             if "Music Suggestions:" in sound_design_part:
+                 sound_design = sound_design_part.split("Music Suggestions:")[0].strip()
+             else:
+                 sound_design = sound_design_part.strip()
+
+         # Music Suggestions
+         if "Music Suggestions:" in generated_text:
+             parts = generated_text.split("Music Suggestions:")
+             music_suggestions = parts[1].strip()
+
+         return voice_script, sound_design, music_suggestions

      except Exception as e:
+         return f"Error generating script: {e}", "", ""

+ # ---------------------------------------------------------------------
+ # Voice-Over Generation Function
+ # ---------------------------------------------------------------------
+ @spaces.GPU(duration=100)
+ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
+     """
+     Generates a voice-over from the provided script using the Coqui TTS model.
+     Returns the file path to the generated .wav file.
+     """
      try:
          if not script.strip():
+             return "Error: No script provided."
+
+         tts_model = get_tts_model(tts_model_name)
+
+         # Generate and save voice
+         output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
+         tts_model.tts_to_file(text=script, file_path=output_path)
+         return output_path
+
      except Exception as e:
+         return f"Error generating voice: {e}"
+
+
+ # ---------------------------------------------------------------------
+ # Music Generation Function
+ # ---------------------------------------------------------------------
+ @spaces.GPU(duration=100)
+ def generate_music(prompt: str, audio_length: int):
+     """
+     Generates music from the 'facebook/musicgen-large' model based on the prompt.
+     Returns the file path to the generated .wav file.
+     """
      try:
+         if not prompt.strip():
+             return "Error: No music suggestion provided."
+
+         model_key = "facebook/musicgen-large"
+         musicgen_model, musicgen_processor = get_musicgen_model(model_key)
+
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
+
+         with torch.inference_mode():
+             outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
+
+         audio_data = outputs[0, 0].cpu().numpy()
+         normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
+
+         output_path = f"{tempfile.gettempdir()}/musicgen_large_generated_music.wav"
+         write(output_path, 44100, normalized_audio)
+
+         return output_path
+
      except Exception as e:
+         return f"Error generating music: {e}"
+
+
+ # ---------------------------------------------------------------------
+ # Audio Blending with Duration Sync & Ducking
+ # ---------------------------------------------------------------------
+ @spaces.GPU(duration=100)
+ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
+     """
+     Blends two audio files (voice and music).
+     1. If music < voice, loops the music until it meets/exceeds the voice duration.
+     2. If music > voice, trims music to the voice duration.
+     3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice is playing.
+     Returns the file path to the blended .wav file.
+     """
      try:
+         if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
+             return "Error: Missing audio files for blending."
+
          voice = AudioSegment.from_wav(voice_path)
          music = AudioSegment.from_wav(music_path)

+         voice_len = len(voice)  # in milliseconds
+         music_len = len(music)  # in milliseconds
+
+         # 1) If the music is shorter than the voice, loop it:
+         if music_len < voice_len:
+             looped_music = AudioSegment.empty()
+             # Keep appending until we exceed voice length
+             while len(looped_music) < voice_len:
+                 looped_music += music
+             music = looped_music

+         # 2) If the music is longer than the voice, truncate it:
+         if len(music) > voice_len:
+             music = music[:voice_len]
+
+         # Now music and voice are the same length
          if ducking:
+             # Step 1: Reduce music dB while voice is playing
+             ducked_music = music - duck_level
+             # Step 2: Overlay voice on top of ducked music
+             final_audio = ducked_music.overlay(voice)
+         else:
+             # No ducking, just overlay
+             final_audio = music.overlay(voice)
+
+         output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
+         final_audio.export(output_path, format="wav")
+         return output_path

      except Exception as e:
+         return f"Error blending audio: {e}"
+
+
+ # ---------------------------------------------------------------------
  # Gradio Interface
+ # ---------------------------------------------------------------------
+ with gr.Blocks() as demo:
+     gr.Markdown("""
+     # 🎧 AI Promo Studio
+     Welcome to **AI Promo Studio**, your all-in-one solution for creating professional, engaging audio promos with minimal effort!
+
+     This platform uses powerful AI models to handle:
+     - **Script Generation**: Craft concise and impactful copy with LLaMA.
+     - **Voice Synthesis**: Convert text into natural-sounding voice-overs using Coqui TTS.
+     - **Music Production**: Generate custom music tracks with MusicGen Large as a sound bed.
+     - **Seamless Blending**: Combine voice and music—looping or trimming the track to match your desired promo length, with optional ducking to keep the voice front and center.
+
+     Whether you’re a radio producer, podcaster, or content creator, **AI Promo Studio** streamlines your entire production pipeline—cutting hours of manual editing down to a few clicks.
+     """)

      with gr.Tabs():
+         # Step 1: Generate Script
+         with gr.Tab("Step 1: Generate Script"):
              with gr.Row():
+                 user_prompt = gr.Textbox(
+                     label="Promo Idea",
+                     placeholder="E.g., A 30-second promo for a morning show...",
+                     lines=2
+                 )
+                 llama_model_id = gr.Textbox(
+                     label="LLaMA Model ID",
+                     value="meta-llama/Meta-Llama-3-8B-Instruct",
+                     placeholder="Enter a valid Hugging Face model ID"
+                 )
+                 duration = gr.Slider(
+                     label="Desired Promo Duration (seconds)",
+                     minimum=15,
+                     maximum=60,
+                     step=15,
+                     value=30
                  )

+             generate_script_button = gr.Button("Generate Script")
+             script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False)
+             sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
+             music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)

+             generate_script_button.click(
+                 fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur),
+                 inputs=[user_prompt, llama_model_id, duration],
+                 outputs=[script_output, sound_design_output, music_suggestion_output],
+             )

+         # Step 2: Generate Voice
+         with gr.Tab("Step 2: Generate Voice"):
+             gr.Markdown("Generate the voice-over using a Coqui TTS model.")
+             selected_tts_model = gr.Dropdown(
+                 label="TTS Model",
+                 choices=[
+                     "tts_models/en/ljspeech/tacotron2-DDC",
+                     "tts_models/en/ljspeech/vits",
+                     "tts_models/en/sam/tacotron-DDC",
+                 ],
+                 value="tts_models/en/ljspeech/tacotron2-DDC",
+                 multiselect=False
+             )
+             generate_voice_button = gr.Button("Generate Voice-Over")
+             voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")
+
+             generate_voice_button.click(
+                 fn=lambda script, tts_model: generate_voice(script, tts_model),
+                 inputs=[script_output, selected_tts_model],
+                 outputs=voice_audio_output,
+             )
+
+         # Step 3: Generate Music (MusicGen Large)
+         with gr.Tab("Step 3: Generate Music"):
+             gr.Markdown("Generate a music track with the **MusicGen Large** model.")
+             audio_length = gr.Slider(
+                 label="Music Length (tokens)",
+                 minimum=128,
+                 maximum=1024,
+                 step=64,
+                 value=512,
+                 info="Increase tokens for longer audio, but be mindful of inference time."
+             )
+             generate_music_button = gr.Button("Generate Music")
+             music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")

+             generate_music_button.click(
+                 fn=lambda music_suggestion, length: generate_music(music_suggestion, length),
+                 inputs=[music_suggestion_output, audio_length],
+                 outputs=[music_output],
+             )
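Two observations on the token slider driving generate_music. MusicGen produces roughly 50 tokens per second of audio (the previous version of this file sized generation as int(duration_sec * 50) for the same reason), so the default 512 tokens is about 10 seconds and the 1024 maximum about 20 seconds of music. Note also that the MusicGen checkpoints output audio at a 32 kHz sampling rate, while generate_music writes the WAV header with 44100; write(output_path, 32000, normalized_audio) would match the model's native rate.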
+
+         # Step 4: Blend Audio (Loop/Trim + Ducking)
+         with gr.Tab("Step 4: Blend Audio"):
+             gr.Markdown("**Music** will be looped or trimmed to match **Voice** duration, then optionally ducked.")
+             ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
+             duck_level_slider = gr.Slider(
+                 label="Ducking Level (dB attenuation)",
+                 minimum=0,
+                 maximum=20,
+                 step=1,
+                 value=10
+             )
+             blend_button = gr.Button("Blend Voice + Music")
+             blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
+
+             blend_button.click(
+                 fn=blend_audio,
+                 inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
+                 outputs=blended_output
+             )
+
+     # Footer
      gr.Markdown("""
+     <hr>
+     <p style="text-align: center; font-size: 0.9em;">
+         Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
+     </p>
      """)

+     # Visitor Badge
+     gr.HTML("""
+     <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
+         <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759" />
+     </a>
+     """)

+ demo.launch(debug=True)
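Taken together, the four new functions chain cleanly outside the UI as well. A minimal sketch (assuming a session where the functions are defined, HF_TOKEN grants access to the gated LLaMA weights, and a GPU is available; values are illustrative):

    voice_script, sound_design, music_ideas = generate_script(
        "A 30-second promo for a morning show",
        "meta-llama/Meta-Llama-3-8B-Instruct",
        HF_TOKEN,
        30,
    )
    voice_path = generate_voice(voice_script)      # -> .../voice_over.wav
    music_path = generate_music(music_ideas, 512)  # 512 tokens, roughly 10 s of music
    final_path = blend_audio(voice_path, music_path, ducking=True, duck_level=10)
    print(final_path)                              # -> .../blended_output.wav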