import os import re import torch import tempfile from scipy.io.wavfile import write from pydub import AudioSegment from dotenv import load_dotenv import spaces import gradio as gr # Transformers & Models from transformers import ( AutoTokenizer, AutoModelForCausalLM, pipeline, AutoProcessor, MusicgenForConditionalGeneration, ) # Coqui TTS from TTS.api import TTS # --------------------------------------------------------------------- # Load Environment Variables # --------------------------------------------------------------------- load_dotenv() HF_TOKEN = os.getenv("HF_TOKEN") # --------------------------------------------------------------------- # Global Model Caches # --------------------------------------------------------------------- LLAMA_PIPELINES = {} MUSICGEN_MODELS = {} TTS_MODELS = {} # --------------------------------------------------------------------- # Utility Function: Clean Text # --------------------------------------------------------------------- def clean_text(text: str) -> str: """ Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary. """ # Remove all asterisks. You can add more cleaning steps here as needed. return re.sub(r'\*', '', text) # --------------------------------------------------------------------- # Helper Functions # --------------------------------------------------------------------- def get_llama_pipeline(model_id: str, token: str): """ Returns a cached LLaMA pipeline if available; otherwise, loads it. """ if model_id in LLAMA_PIPELINES: return LLAMA_PIPELINES[model_id] tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token) model = AutoModelForCausalLM.from_pretrained( model_id, use_auth_token=token, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True, ) text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer) LLAMA_PIPELINES[model_id] = text_pipeline return text_pipeline def get_musicgen_model(model_key: str = "facebook/musicgen-large"): """ Returns a cached MusicGen model if available; otherwise, loads it. Uses the 'large' variant for higher quality outputs. """ if model_key in MUSICGEN_MODELS: return MUSICGEN_MODELS[model_key] model = MusicgenForConditionalGeneration.from_pretrained(model_key) processor = AutoProcessor.from_pretrained(model_key) device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) MUSICGEN_MODELS[model_key] = (model, processor) return model, processor def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"): """ Returns a cached TTS model if available; otherwise, loads it. """ if model_name in TTS_MODELS: return TTS_MODELS[model_name] tts_model = TTS(model_name) TTS_MODELS[model_name] = tts_model return tts_model # --------------------------------------------------------------------- # Script Generation Function # --------------------------------------------------------------------- @spaces.GPU(duration=100) def generate_script(user_prompt: str, model_id: str, token: str, duration: int): """ Generates a script, sound design suggestions, and music ideas from a user prompt. Returns a tuple of strings: (voice_script, sound_design, music_suggestions). """ try: text_pipeline = get_llama_pipeline(model_id, token) system_prompt = ( "You are an expert radio imaging producer specializing in sound design and music. " f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: " "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n" "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n" "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'." ) combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:" with torch.inference_mode(): result = text_pipeline( combined_prompt, max_new_tokens=300, do_sample=True, temperature=0.8 ) generated_text = result[0]["generated_text"] if "Output:" in generated_text: generated_text = generated_text.split("Output:")[-1].strip() # Default placeholders voice_script = "No voice-over script found." sound_design = "No sound design suggestions found." music_suggestions = "No music suggestions found." # Voice-Over Script if "Voice-Over Script:" in generated_text: parts = generated_text.split("Voice-Over Script:") voice_script_part = parts[1] if "Sound Design Suggestions:" in voice_script_part: voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip() else: voice_script = voice_script_part.strip() # Sound Design if "Sound Design Suggestions:" in generated_text: parts = generated_text.split("Sound Design Suggestions:") sound_design_part = parts[1] if "Music Suggestions:" in sound_design_part: sound_design = sound_design_part.split("Music Suggestions:")[0].strip() else: sound_design = sound_design_part.strip() # Music Suggestions if "Music Suggestions:" in generated_text: parts = generated_text.split("Music Suggestions:") music_suggestions = parts[1].strip() return voice_script, sound_design, music_suggestions except Exception as e: return f"Error generating script: {e}", "", "" # --------------------------------------------------------------------- # Voice-Over Generation Function # --------------------------------------------------------------------- @spaces.GPU(duration=100) def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"): """ Generates a voice-over from the provided script using the Coqui TTS model. Returns the file path to the generated .wav file. """ try: if not script.strip(): return "Error: No script provided." # Clean the script to remove special characters (e.g., asterisks) that may produce warnings cleaned_script = clean_text(script) tts_model = get_tts_model(tts_model_name) # Generate and save voice output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav") tts_model.tts_to_file(text=cleaned_script, file_path=output_path) return output_path except Exception as e: return f"Error generating voice: {e}" # --------------------------------------------------------------------- # Music Generation Function # --------------------------------------------------------------------- @spaces.GPU(duration=200) def generate_music(prompt: str, audio_length: int): """ Generates music from the 'facebook/musicgen-large' model based on the prompt. Returns the file path to the generated .wav file. """ try: if not prompt.strip(): return "Error: No music suggestion provided." model_key = "facebook/musicgen-large" musicgen_model, musicgen_processor = get_musicgen_model(model_key) device = "cuda" if torch.cuda.is_available() else "cpu" inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device) with torch.inference_mode(): outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length) audio_data = outputs[0, 0].cpu().numpy() normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16") output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav") write(output_path, 44100, normalized_audio) return output_path except Exception as e: return f"Error generating music: {e}" # --------------------------------------------------------------------- # Audio Blending with Duration Sync & Ducking # --------------------------------------------------------------------- @spaces.GPU(duration=100) def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10): """ Blends two audio files (voice and music). 1. If music < voice, loops the music until it meets/exceeds the voice duration. 2. If music > voice, trims music to the voice duration. 3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice is playing. Returns the file path to the blended .wav file. """ try: if not os.path.isfile(voice_path) or not os.path.isfile(music_path): return "Error: Missing audio files for blending." voice = AudioSegment.from_wav(voice_path) music = AudioSegment.from_wav(music_path) voice_len = len(voice) # in milliseconds music_len = len(music) # in milliseconds # Loop music if it's shorter than the voice if music_len < voice_len: looped_music = AudioSegment.empty() while len(looped_music) < voice_len: looped_music += music music = looped_music # Trim music if it's longer than the voice if len(music) > voice_len: music = music[:voice_len] if ducking: ducked_music = music - duck_level final_audio = ducked_music.overlay(voice) else: final_audio = music.overlay(voice) output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav") final_audio.export(output_path, format="wav") return output_path except Exception as e: return f"Error blending audio: {e}" # --------------------------------------------------------------------- # Gradio Interface with Enhanced UI # --------------------------------------------------------------------- with gr.Blocks(css=""" /* Global Styles */ body { background: linear-gradient(135deg, #1d1f21, #3a3d41); color: #f0f0f0; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; } .header { text-align: center; padding: 2rem 1rem; background: linear-gradient(90deg, #6a11cb, #2575fc); border-radius: 0 0 20px 20px; margin-bottom: 2rem; } .header h1 { margin: 0; font-size: 2.5rem; } .header p { font-size: 1.2rem; } .gradio-container { background: #2e2e2e; border-radius: 10px; padding: 1rem; } .tab-title { font-size: 1.1rem; font-weight: bold; } .footer { text-align: center; font-size: 0.9em; margin-top: 2rem; padding: 1rem; color: #cccccc; } """) as demo: # Custom Header with gr.Row(elem_classes="header"): gr.Markdown("""
Your all-in-one AI solution for crafting engaging audio promos.
""") gr.Markdown(""" Welcome to **AI Promo Studio**! This platform leverages state-of-the-art AI models to help you generate: - **Script**: Generate a compelling voice-over script with LLaMA. - **Voice Synthesis**: Create natural-sounding voice-overs using Coqui TTS. - **Music Production**: Produce custom music tracks with MusicGen. - **Audio Blending**: Seamlessly blend voice and music with options for ducking. """) with gr.Tabs(): # Step 1: Generate Script with gr.Tab("📝 Script Generation"): with gr.Row(): user_prompt = gr.Textbox( label="Promo Idea", placeholder="E.g., A 30-second promo for a morning show...", lines=2 ) with gr.Row(): llama_model_id = gr.Textbox( label="LLaMA Model ID", value="meta-llama/Meta-Llama-3-8B-Instruct", placeholder="Enter a valid Hugging Face model ID" ) duration = gr.Slider( label="Desired Promo Duration (seconds)", minimum=15, maximum=60, step=15, value=30 ) generate_script_button = gr.Button("Generate Script", variant="primary") script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False) sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False) music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False) generate_script_button.click( fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur), inputs=[user_prompt, llama_model_id, duration], outputs=[script_output, sound_design_output, music_suggestion_output], ) # Step 2: Generate Voice with gr.Tab("🎤 Voice Synthesis"): gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.") selected_tts_model = gr.Dropdown( label="TTS Model", choices=[ "tts_models/en/ljspeech/tacotron2-DDC", "tts_models/en/ljspeech/vits", "tts_models/en/sam/tacotron-DDC", ], value="tts_models/en/ljspeech/tacotron2-DDC", multiselect=False ) generate_voice_button = gr.Button("Generate Voice-Over", variant="primary") voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath") generate_voice_button.click( fn=lambda script, tts_model: generate_voice(script, tts_model), inputs=[script_output, selected_tts_model], outputs=voice_audio_output, ) # Step 3: Generate Music with gr.Tab("🎶 Music Production"): gr.Markdown("Generate a custom music track using the **MusicGen Large** model.") audio_length = gr.Slider( label="Music Length (tokens)", minimum=128, maximum=1024, step=64, value=512, info="Increase tokens for longer audio (inference time may vary)." ) generate_music_button = gr.Button("Generate Music", variant="primary") music_output = gr.Audio(label="Generated Music (WAV)", type="filepath") generate_music_button.click( fn=lambda music_suggestion, length: generate_music(music_suggestion, length), inputs=[music_suggestion_output, audio_length], outputs=[music_output], ) # Step 4: Blend Audio with gr.Tab("🎚️ Audio Blending"): gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.") ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True) duck_level_slider = gr.Slider( label="Ducking Level (dB attenuation)", minimum=0, maximum=20, step=1, value=10 ) blend_button = gr.Button("Blend Voice + Music", variant="primary") blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath") blend_button.click( fn=blend_audio, inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider], outputs=blended_output ) # Footer gr.Markdown(""" """) # Visitor Badge gr.HTML(""" """) demo.launch(debug=True)