Bils committed (verified)
Commit 559ca26 · 1 parent: 2925d53

Update app.py

Files changed (1)
  app.py  +148 -70
app.py CHANGED
@@ -13,22 +13,22 @@ from pydub import AudioSegment
 from dotenv import load_dotenv
 import tempfile
 import spaces
+
+# Coqui TTS
 from TTS.api import TTS
-from TTS.utils.synthesizer import Synthesizer

 # ---------------------------------------------------------------------
 # Load Environment Variables
 # ---------------------------------------------------------------------
 load_dotenv()
-HF_TOKEN = os.getenv("HF_TOKEN")
+HF_TOKEN = os.getenv("HF_TOKEN")  # Adjust if needed

 # ---------------------------------------------------------------------
 # Global Model Caches
 # ---------------------------------------------------------------------
-# We store models/pipelines in global variables for reuse,
-# so they are only loaded once.
 LLAMA_PIPELINES = {}
 MUSICGEN_MODELS = {}
+TTS_MODELS = {}

 # ---------------------------------------------------------------------
 # Helper Functions
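The three module-level dicts above (`LLAMA_PIPELINES`, `MUSICGEN_MODELS`, and the new `TTS_MODELS`) are plain per-process memoization: each maps a model id to its loaded object so the Space pays the load cost only once. A minimal sketch of the pattern, with a hypothetical `loader` callable standing in for any expensive constructor:

```python
# Minimal sketch of the caching pattern used by the three dicts above.
# `loader` is a hypothetical stand-in for any expensive model constructor.
_CACHE = {}

def get_cached(key, loader):
    """Return the cached object for `key`, invoking `loader` only on a miss."""
    if key not in _CACHE:
        _CACHE[key] = loader(key)
    return _CACHE[key]
```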
@@ -36,12 +36,10 @@ MUSICGEN_MODELS = {}
 def get_llama_pipeline(model_id: str, token: str):
     """
     Returns a cached LLaMA pipeline if available; otherwise, loads it.
-    This significantly reduces loading time for repeated calls.
     """
     if model_id in LLAMA_PIPELINES:
         return LLAMA_PIPELINES[model_id]

-    # Load new pipeline and store in cache
     tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
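One caveat on this hunk: recent `transformers` releases deprecate the `use_auth_token` kwarg in favor of `token`. A hedged sketch of an equivalent load under that assumption (the dtype/device kwargs are guesses, since the hunk's remaining lines are collapsed here):

```python
# Hedged sketch, not the committed code: `token=` replaces the deprecated
# `use_auth_token=` on newer transformers; dtype/device kwargs are assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_llama(model_id: str, hf_token: str):
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
        torch_dtype="auto",  # assumption: let transformers choose fp16/bf16 on GPU
        device_map="auto",   # assumption: requires the `accelerate` package
    )
    return tokenizer, model
```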
@@ -55,14 +53,14 @@ def get_llama_pipeline(model_id: str, token: str):
     return text_pipeline


-def get_musicgen_model(model_key: str = "facebook/musicgen-medium"):
+def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
     """
     Returns a cached MusicGen model if available; otherwise, loads it.
+    Uses the 'large' variant for higher quality outputs.
     """
     if model_key in MUSICGEN_MODELS:
         return MUSICGEN_MODELS[model_key]

-    # Load new MusicGen model and store in cache
     model = MusicgenForConditionalGeneration.from_pretrained(model_key)
     processor = AutoProcessor.from_pretrained(model_key)

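Worth flagging on the switch to the large checkpoint: `facebook/musicgen-large` is roughly 3.3B parameters, so the reload this cache avoids is substantial. Illustrative use of the loader (device handling is an assumption, since the hunk's collapsed lines likely already move the model):

```python
# Illustrative usage of the cached loader; the .to(device) call is an
# assumption here, as the collapsed hunk lines presumably do this already.
import torch

model, processor = get_musicgen_model("facebook/musicgen-large")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
```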
@@ -73,6 +71,18 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-medium"):
     return model, processor


+def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
+    """
+    Returns a cached TTS model if available; otherwise, loads it.
+    """
+    if model_name in TTS_MODELS:
+        return TTS_MODELS[model_name]
+
+    tts_model = TTS(model_name)
+    TTS_MODELS[model_name] = tts_model
+    return tts_model
+
+
 # ---------------------------------------------------------------------
 # Script Generation Function
 # ---------------------------------------------------------------------
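`TTS(model_name)` downloads and initializes the checkpoint, so caching it matters for repeated calls. A quick smoke test for the new helper (the output path is illustrative):

```python
# Smoke test: the second lookup should hit TTS_MODELS rather than re-initialize.
tts = get_tts_model("tts_models/en/ljspeech/tacotron2-DDC")
assert get_tts_model("tts_models/en/ljspeech/tacotron2-DDC") is tts
tts.tts_to_file(text="Testing one two three.", file_path="/tmp/tts_smoke_test.wav")
```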
@@ -85,7 +95,6 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
     try:
         text_pipeline = get_llama_pipeline(model_id, token)

-        # System prompt with clear structure instructions
         system_prompt = (
             "You are an expert radio imaging producer specializing in sound design and music. "
             f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
@@ -93,10 +102,8 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
             "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
             "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
         )
-
         combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"

-        # Use inference mode for efficient forward passes
         with torch.inference_mode():
             result = text_pipeline(
                 combined_prompt,
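The `split("Output:")` cleanup further down exists because text-generation pipelines echo the prompt by default; `return_full_text=False` is the built-in way to suppress that. A hedged variant of this call (the sampling kwargs hidden in the collapsed lines are assumed):

```python
# Hedged variant, not the committed code: return_full_text=False drops the
# echoed prompt, making the split("Output:") cleanup below unnecessary.
with torch.inference_mode():
    result = text_pipeline(
        combined_prompt,
        max_new_tokens=300,    # assumption: the real value is in collapsed lines
        do_sample=True,
        temperature=0.8,
        return_full_text=False,
    )
generated_text = result[0]["generated_text"].strip()
```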
@@ -105,38 +112,37 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
                 temperature=0.8
             )

-        # LLaMA pipeline returns a list of dicts with "generated_text"
         generated_text = result[0]["generated_text"]
-
-        # Basic parsing to isolate everything after "Output:"
-        # (in case the model repeated your system prompt).
         if "Output:" in generated_text:
             generated_text = generated_text.split("Output:")[-1].strip()

-        # Extract sections based on known prefixes
+        # Default placeholders
         voice_script = "No voice-over script found."
         sound_design = "No sound design suggestions found."
         music_suggestions = "No music suggestions found."

+        # Voice-Over Script
         if "Voice-Over Script:" in generated_text:
             parts = generated_text.split("Voice-Over Script:")
-            if len(parts) > 1:
-                # Everything after "Voice-Over Script:" up until next prefix
-                voice_script_part = parts[1]
-                voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip() \
-                    if "Sound Design Suggestions:" in voice_script_part else voice_script_part.strip()
+            voice_script_part = parts[1]
+            if "Sound Design Suggestions:" in voice_script_part:
+                voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip()
+            else:
+                voice_script = voice_script_part.strip()

+        # Sound Design
         if "Sound Design Suggestions:" in generated_text:
             parts = generated_text.split("Sound Design Suggestions:")
-            if len(parts) > 1:
-                sound_design_part = parts[1]
-                sound_design = sound_design_part.split("Music Suggestions:")[0].strip() \
-                    if "Music Suggestions:" in sound_design_part else sound_design_part.strip()
+            sound_design_part = parts[1]
+            if "Music Suggestions:" in sound_design_part:
+                sound_design = sound_design_part.split("Music Suggestions:")[0].strip()
+            else:
+                sound_design = sound_design_part.strip()

+        # Music Suggestions
         if "Music Suggestions:" in generated_text:
             parts = generated_text.split("Music Suggestions:")
-            if len(parts) > 1:
-                music_suggestions = parts[1].strip()
+            music_suggestions = parts[1].strip()

         return voice_script, sound_design, music_suggestions

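Dropping the old `len(parts) > 1` guards is safe: each `split` is gated by an `in` check, and splitting on a substring that is present always yields at least two parts. For comparison, a hedged regex version of the same three-section parse:

```python
import re

# Hedged alternative, not the committed code: one helper replaces the three
# near-identical if-blocks above.
def parse_sections(text):
    def grab(prefix, stoppers):
        stop = "|".join([re.escape(s) for s in stoppers] + [r"\Z"])
        match = re.search(re.escape(prefix) + r"(.*?)(?=" + stop + ")", text, re.S)
        return match.group(1).strip() if match else None

    voice = grab("Voice-Over Script:", ["Sound Design Suggestions:", "Music Suggestions:"])
    sound = grab("Sound Design Suggestions:", ["Music Suggestions:"])
    music = grab("Music Suggestions:", [])
    return (
        voice or "No voice-over script found.",
        sound or "No sound design suggestions found.",
        music or "No music suggestions found.",
    )
```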
@@ -145,46 +151,55 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):


 # ---------------------------------------------------------------------
-# Voice-Over Generation Function (Inactive)
+# Voice-Over Generation Function
 # ---------------------------------------------------------------------
 @spaces.GPU(duration=100)
-def generate_voice(script: str, speaker: str = "default"):
+def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     """
-    Placeholder for future voice-over generation functionality.
+    Generates a voice-over from the provided script using the Coqui TTS model.
+    Returns the file path to the generated .wav file.
     """
     try:
-        return "Voice-over generation is currently inactive."
+        if not script.strip():
+            return "Error: No script provided."
+
+        tts_model = get_tts_model(tts_model_name)
+
+        # Generate and save voice
+        output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
+        tts_model.tts_to_file(text=script, file_path=output_path)
+        return output_path
+
     except Exception as e:
-        return f"Error: {e}"
+        return f"Error generating voice: {e}"


 # ---------------------------------------------------------------------
-# Music Generation Function
+# Music Generation Function (Using facebook/musicgen-large)
 # ---------------------------------------------------------------------
 @spaces.GPU(duration=100)
 def generate_music(prompt: str, audio_length: int):
     """
-    Generates music from the 'facebook/musicgen-medium' model based on the prompt.
+    Generates music from the 'facebook/musicgen-large' model based on the prompt.
     Returns the file path to the generated .wav file.
     """
     try:
-        model_key = "facebook/musicgen-medium"
+        if not prompt.strip():
+            return "Error: No music suggestion provided."
+
+        model_key = "facebook/musicgen-large"
         musicgen_model, musicgen_processor = get_musicgen_model(model_key)

         device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Prepare input
         inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)

-        # Generate music within inference mode
         with torch.inference_mode():
             outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)

         audio_data = outputs[0, 0].cpu().numpy()
-        # Normalize audio to int16 format
         normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")

-        # Save generated music to a temp file
-        output_path = f"{tempfile.gettempdir()}/musicgen_medium_generated_music.wav"
+        output_path = f"{tempfile.gettempdir()}/musicgen_large_generated_music.wav"
         write(output_path, 44100, normalized_audio)

         return output_path
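One likely bug carried over unchanged in this hunk: the released `facebook/musicgen-*` checkpoints decode audio at 32 kHz, but the buffer is written at 44100 Hz, which would play the result back noticeably fast and high-pitched. Reading the rate from the model config avoids hard-coding either number:

```python
# Suggested fix (hedged): use the model's native rate instead of 44100.
sampling_rate = musicgen_model.config.audio_encoder.sampling_rate  # 32000 expected
write(output_path, sampling_rate, normalized_audio)

# Duration rule of thumb under the same assumption (~50 tokens per second):
# 512 tokens is roughly 10 s of audio, 1024 tokens roughly 20 s.
```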
@@ -194,16 +209,46 @@ def generate_music(prompt: str, audio_length: int):


 # ---------------------------------------------------------------------
-# Audio Blending Function (Inactive)
+# Audio Blending Function with Ducking
 # ---------------------------------------------------------------------
-def blend_audio(voice_path: str, music_path: str, ducking: bool):
+@spaces.GPU(duration=100)
+def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
     """
-    Placeholder for future audio blending functionality with optional ducking.
+    Blends two audio files (voice and music). If ducking=True,
+    the music is attenuated by 'duck_level' dB while the voice is playing.
+    Returns the file path to the blended .wav file.
     """
     try:
-        return "Audio blending functionality is currently inactive."
+        if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
+            return "Error: Missing audio files for blending."
+
+        voice = AudioSegment.from_wav(voice_path)
+        music = AudioSegment.from_wav(music_path)
+
+        # If the voice is longer than the music, extend music with silence
+        if len(voice) > len(music):
+            extension = AudioSegment.silent(duration=(len(voice) - len(music)))
+            music = music + extension
+
+        if ducking:
+            # Step 1: Reduce music by `duck_level` dB for the portion matching the voice duration
+            ducked_music_part = music[:len(voice)] - duck_level
+            # Overlay voice on top of the ducked music portion
+            voice_overlaid = ducked_music_part.overlay(voice)
+
+            # Step 2: Keep the rest of the music as-is
+            remainder = music[len(voice):]
+            final_audio = voice_overlaid + remainder
+        else:
+            # No ducking, just overlay
+            final_audio = music.overlay(voice)
+
+        output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
+        final_audio.export(output_path, format="wav")
+        return output_path
+
     except Exception as e:
-        return f"Error: {e}"
+        return f"Error blending audio: {e}"


 # ---------------------------------------------------------------------
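The pydub arithmetic in this hunk is worth spelling out: subtracting an integer from a segment lowers its gain in dB, slicing is by milliseconds, and `+` between two segments concatenates them. A standalone sketch of the ducking path (file names are illustrative):

```python
# Standalone sketch of the ducking math above; paths are illustrative.
from pydub import AudioSegment

voice = AudioSegment.from_wav("voice_over.wav")
music = AudioSegment.from_wav("music.wav")

ducked_bed = music[:len(voice)] - 10     # "- 10" lowers gain by 10 dB
mixed = ducked_bed.overlay(voice)        # mix the voice over the quieter bed
final = mixed + music[len(voice):]       # "+" concatenates the untouched tail
final.export("blended.wav", format="wav")
```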
@@ -211,9 +256,15 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool):
 # ---------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("""
-    # 🎧 AI Promo Studio 🚀
-    Welcome to **AI Promo Studio**, your one-stop solution for creating stunning and professional radio promos with ease!
-    Whether you're a sound designer, radio producer, or content creator, our AI-driven tools, powered by advanced LLaMA models, empower you to bring your vision to life in just a few steps.
+    # 🎧 AI Promo Studio with MusicGen Large, Voice Over & Audio Blending 🚀
+    Welcome to **AI Promo Studio**!
+    This pipeline uses **facebook/musicgen-large** for high-quality background music (more resource-intensive).
+
+    **Workflow**:
+    1. **Generate Script** (via LLaMA)
+    2. **Generate Voice-Over** (via Coqui TTS)
+    3. **Generate Music** (via MusicGen Large)
+    4. **Blend** (Voice + Music) with optional ducking
     """)

     with gr.Tabs():
@@ -249,24 +300,39 @@ with gr.Blocks() as demo:
                 outputs=[script_output, sound_design_output, music_suggestion_output],
             )

-        # Step 2: Generate Voice (Inactive)
+        # Step 2: Generate Voice
         with gr.Tab("Step 2: Generate Voice"):
-            gr.Markdown("""
-            **Note:** Voice-over generation is currently inactive.
-            This feature will be available in future updates!
-            """)
+            gr.Markdown("Generate the voice-over using a Coqui TTS model.")
+            selected_tts_model = gr.Dropdown(
+                label="TTS Model",
+                choices=[
+                    "tts_models/en/ljspeech/tacotron2-DDC",
+                    "tts_models/en/ljspeech/vits",
+                    "tts_models/en/sam/tacotron-DDC",
+                ],
+                value="tts_models/en/ljspeech/tacotron2-DDC",
+                multiselect=False
+            )
+            generate_voice_button = gr.Button("Generate Voice-Over")
+            voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")
+
+            generate_voice_button.click(
+                fn=lambda script, tts_model: generate_voice(script, tts_model),
+                inputs=[script_output, selected_tts_model],
+                outputs=voice_audio_output,
+            )

-        # Step 3: Generate Music
+        # Step 3: Generate Music (MusicGen Large)
         with gr.Tab("Step 3: Generate Music"):
-            with gr.Row():
-                audio_length = gr.Slider(
-                    label="Music Length (tokens)",
-                    minimum=128,
-                    maximum=1024,
-                    step=64,
-                    value=512,
-                    info="Increase tokens for longer audio, but be mindful of inference time."
-                )
+            gr.Markdown("Generate a music track with the **MusicGen Large** model.")
+            audio_length = gr.Slider(
+                label="Music Length (tokens)",
+                minimum=128,
+                maximum=1024,
+                step=64,
+                value=512,
+                info="Increase tokens for longer audio, but be mindful of inference time."
+            )
             generate_music_button = gr.Button("Generate Music")
             music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")

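A minor simplification is available in the voice tab wiring: the lambda is a pure pass-through, and since the Gradio inputs already line up with `generate_voice`'s positional parameters the handler can be passed directly:

```python
# Equivalent wiring without the pass-through lambda:
generate_voice_button.click(
    fn=generate_voice,
    inputs=[script_output, selected_tts_model],
    outputs=voice_audio_output,
)
```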
@@ -276,14 +342,27 @@ with gr.Blocks() as demo:
                 outputs=[music_output],
             )

-        # Step 4: Blend Audio (Inactive)
+        # Step 4: Blend Audio
         with gr.Tab("Step 4: Blend Audio"):
-            gr.Markdown("""
-            **Note:** Audio blending functionality is currently inactive.
-            This feature will be available in future updates!
-            """)
+            gr.Markdown("Combine voice-over and music, optionally applying ducking.")
+            ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
+            duck_level_slider = gr.Slider(
+                label="Ducking Level (dB attenuation)",
+                minimum=0,
+                maximum=20,
+                step=1,
+                value=10
+            )
+            blend_button = gr.Button("Blend Voice + Music")
+            blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
+
+            blend_button.click(
+                fn=blend_audio,
+                inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
+                outputs=blended_output
+            )

-    # Footer / Credits
+    # Footer
     gr.Markdown("""
         <hr>
         <p style="text-align: center; font-size: 0.9em;">
@@ -298,5 +377,4 @@ with gr.Blocks() as demo:
         </a>
     """)

-# Launch the Gradio app
 demo.launch(debug=True)
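Possible follow-up (hedged, not part of this commit): with three `@spaces.GPU` handlers that can each run for up to 100 s, enabling Gradio's request queue keeps concurrent users from hitting timeouts:

```python
# Hedged suggestion: queue long-running GPU requests before launching.
demo.queue().launch(debug=True)
```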
 