Spaces:

Bils
/

AIPromoStudio

Running on Zero

App Files Files Community

Bils committed 9 days ago

Commit

216e869

verified ·

1 Parent(s): 1653c85

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -30

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from pydub import AudioSegment
 from dotenv import load_dotenv
 import spaces
 import gradio as gr
 # Transformers & Models
 from transformers import (
@@ -17,9 +18,13 @@ from transformers import (
     AutoProcessor,
     MusicgenForConditionalGeneration,
 )
 # Coqui TTS
 from TTS.api import TTS
 # ---------------------------------------------------------------------
 # Setup Logging and Environment Variables
 # ---------------------------------------------------------------------
@@ -33,6 +38,7 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 LLAMA_PIPELINES = {}
 MUSICGEN_MODELS = {}
 TTS_MODELS = {}
 # ---------------------------------------------------------------------
 # Utility Function
@@ -65,7 +71,6 @@ def get_llama_pipeline(model_id: str, token: str):
     LLAMA_PIPELINES[model_id] = text_pipeline
     return text_pipeline
 def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
     """
     Returns a cached MusicGen model and processor if available; otherwise, loads and caches them.
@@ -81,7 +86,6 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
     MUSICGEN_MODELS[model_key] = (model, processor)
     return model, processor
 def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     """
     Returns a cached TTS model if available; otherwise, loads and caches it.
@@ -93,6 +97,16 @@ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     TTS_MODELS[model_name] = tts_model
     return tts_model
 # ---------------------------------------------------------------------
 # Script Generation Function
@@ -127,7 +141,6 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
         if "Output:" in generated_text:
             generated_text = generated_text.split("Output:")[-1].strip()
         pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
         match = re.search(pattern, generated_text, re.DOTALL)
         if match:
@@ -143,7 +156,6 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
         logging.exception("Error generating script")
         return f"Error generating script: {e}", "", ""
 # ---------------------------------------------------------------------
 # Voice-Over Generation Function
 # ---------------------------------------------------------------------
@@ -168,7 +180,6 @@ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/ta
         logging.exception("Error generating voice")
         return f"Error generating voice: {e}"
 # ---------------------------------------------------------------------
 # Music Generation Function
 # ---------------------------------------------------------------------
@@ -202,43 +213,85 @@ def generate_music(prompt: str, audio_length: int):
         logging.exception("Error generating music")
         return f"Error generating music: {e}"
 # ---------------------------------------------------------------------
-# Audio Blending with Duration Sync & Ducking
 # ---------------------------------------------------------------------
 @spaces.GPU(duration=100)
-def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
     """
-    Blends two audio files (voice and music).
-      - Loops music if shorter than voice.
-      - Trims music if longer than voice.
-      - Applies ducking to lower music volume during voice segments if enabled.
     Returns the file path to the blended .wav file.
     """
     try:
-        if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
-            return "Error: Missing audio files for blending."
         voice = AudioSegment.from_wav(voice_path)
         music = AudioSegment.from_wav(music_path)
-        voice_len = len(voice)
         if len(music) < voice_len:
             looped_music = AudioSegment.empty()
             while len(looped_music) < voice_len:
                 looped_music += music
             music = looped_music
         music = music[:voice_len]
         if ducking:
-            ducked_music = music - duck_level
-            final_audio = ducked_music.overlay(voice)
-        else:
-            final_audio = music.overlay(voice)
         output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
         final_audio.export(output_path, format="wav")
@@ -248,7 +301,6 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
         logging.exception("Error blending audio")
         return f"Error blending audio: {e}"
 # ---------------------------------------------------------------------
 # Gradio Interface
 # ---------------------------------------------------------------------
@@ -298,20 +350,21 @@ with gr.Blocks(css="""
         <p>Your all-in-one AI solution for creating professional audio ads.</p>
         """)
     gr.Markdown("""
     **Welcome to Ai Ads Promo!**
     This simple, easy-to-use app helps you create amazing audio ads in just a few steps. Here’s how it works:
     1. **Script Generation:**
-       - Share your idea and let our AI craft a clear and engaging voice-over script.
     2. **Voice Synthesis:**
        - Convert your script into a natural-sounding voice-over using advanced text-to-speech technology.
     3. **Music Production:**
        - Generate a custom music track that perfectly fits your ad.
-    4. **Audio Blending:**
-       - Combine your voice-over and music seamlessly. You can even adjust the music volume (ducking) when the voice plays.
     **Benefits:**
     - **Easy to Use:** Designed for everyone – no technical skills required.
@@ -396,9 +449,21 @@ with gr.Blocks(css="""
                 outputs=[music_output],
             )
-        # Step 4: Audio Blending
         with gr.Tab("🎚️ Audio Blending"):
-            gr.Markdown("Blend your voice-over and music track. Music will be looped or trimmed to match your voice duration. Enable ducking to lower the music while the voice plays.")
             ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
             duck_level_slider = gr.Slider(
                 label="Ducking Level (dB attenuation)",
@@ -407,12 +472,12 @@ with gr.Blocks(css="""
                 step=1,
                 value=10
             )
-            blend_button = gr.Button("Blend Voice + Music", variant="primary")
             blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
             blend_button.click(
                 fn=blend_audio,
-                inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
                 outputs=blended_output
             )

 from dotenv import load_dotenv
 import spaces
 import gradio as gr
+import numpy as np
 # Transformers & Models
 from transformers import (
     AutoProcessor,
     MusicgenForConditionalGeneration,
 )
 # Coqui TTS
 from TTS.api import TTS
+# Diffusers for sound design generation
+from diffusers import DiffusionPipeline
 # ---------------------------------------------------------------------
 # Setup Logging and Environment Variables
 # ---------------------------------------------------------------------
 LLAMA_PIPELINES = {}
 MUSICGEN_MODELS = {}
 TTS_MODELS = {}
+SOUND_DESIGN_PIPELINES = {}
 # ---------------------------------------------------------------------
 # Utility Function
     LLAMA_PIPELINES[model_id] = text_pipeline
     return text_pipeline
 def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
     """
     Returns a cached MusicGen model and processor if available; otherwise, loads and caches them.
     MUSICGEN_MODELS[model_key] = (model, processor)
     return model, processor
 def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     """
     Returns a cached TTS model if available; otherwise, loads and caches it.
     TTS_MODELS[model_name] = tts_model
     return tts_model
+def get_sound_design_pipeline(model_name: str, token: str):
+    """
+    Returns a cached DiffusionPipeline for sound design if available;
+    otherwise, it loads and caches the pipeline.
+    """
+    if model_name in SOUND_DESIGN_PIPELINES:
+        return SOUND_DESIGN_PIPELINES[model_name]
+    pipe = DiffusionPipeline.from_pretrained(model_name, use_auth_token=token)
+    SOUND_DESIGN_PIPELINES[model_name] = pipe
+    return pipe
 # ---------------------------------------------------------------------
 # Script Generation Function
         if "Output:" in generated_text:
             generated_text = generated_text.split("Output:")[-1].strip()
         pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
         match = re.search(pattern, generated_text, re.DOTALL)
         if match:
         logging.exception("Error generating script")
         return f"Error generating script: {e}", "", ""
 # ---------------------------------------------------------------------
 # Voice-Over Generation Function
 # ---------------------------------------------------------------------
         logging.exception("Error generating voice")
         return f"Error generating voice: {e}"
 # ---------------------------------------------------------------------
 # Music Generation Function
 # ---------------------------------------------------------------------
         logging.exception("Error generating music")
         return f"Error generating music: {e}"
+# ---------------------------------------------------------------------
+# Sound Design Generation Function
+# ---------------------------------------------------------------------
+@spaces.GPU(duration=200)
+def generate_sound_design(prompt: str):
+    """
+    Generates a sound design audio file based on the provided prompt using Audioldm2.
+    Returns the file path to the generated .wav file.
+    """
+    try:
+        if not prompt.strip():
+            return "Error: No sound design suggestion provided."
+        pipe = get_sound_design_pipeline("cvssp/audioldm2", HF_TOKEN)
+        # Generate audio from the prompt; assumes the pipeline returns a dict with key 'audios'
+        result = pipe(prompt)
+        audio_samples = result["audios"][0]
+        normalized_audio = (audio_samples / np.max(np.abs(audio_samples)) * 32767).astype("int16")
+        output_path = os.path.join(tempfile.gettempdir(), "sound_design_generated.wav")
+        write(output_path, 44100, normalized_audio)
+        return output_path
+    except Exception as e:
+        logging.exception("Error generating sound design")
+        return f"Error generating sound design: {e}"
 # ---------------------------------------------------------------------
+# Audio Blending with Duration Sync & Ducking (Voice + Sound Design + Music)
 # ---------------------------------------------------------------------
 @spaces.GPU(duration=100)
+def blend_audio(voice_path: str, sound_effect_path: str, music_path: str, ducking: bool, duck_level: int = 10):
     """
+    Blends three audio files (voice, sound design/sound effect, and music):
+      - Loops music and sound design if shorter than the voice track.
+      - Trims both to match the voice duration.
+      - Applies ducking to lower music and sound design volumes during voice segments if enabled.
     Returns the file path to the blended .wav file.
     """
     try:
+        # Verify input files exist
+        for path in [voice_path, sound_effect_path, music_path]:
+            if not os.path.isfile(path):
+                return f"Error: Missing audio file for {path}"
+        # Load audio segments
         voice = AudioSegment.from_wav(voice_path)
         music = AudioSegment.from_wav(music_path)
+        sound_effect = AudioSegment.from_wav(sound_effect_path)
+        voice_len = len(voice)  # duration in milliseconds
+        # Loop or trim music to match voice duration
         if len(music) < voice_len:
             looped_music = AudioSegment.empty()
             while len(looped_music) < voice_len:
                 looped_music += music
             music = looped_music
         music = music[:voice_len]
+        # Loop or trim sound effect to match voice duration
+        if len(sound_effect) < voice_len:
+            looped_effect = AudioSegment.empty()
+            while len(looped_effect) < voice_len:
+                looped_effect += sound_effect
+            sound_effect = looped_effect
+        sound_effect = sound_effect[:voice_len]
+        # Apply ducking to background tracks if enabled
         if ducking:
+            music = music - duck_level
+            sound_effect = sound_effect - duck_level
+        # Combine music and sound effect into a background track
+        background = music.overlay(sound_effect)
+        # Overlay voice on top of the background
+        final_audio = background.overlay(voice)
         output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
         final_audio.export(output_path, format="wav")
         logging.exception("Error blending audio")
         return f"Error blending audio: {e}"
 # ---------------------------------------------------------------------
 # Gradio Interface
 # ---------------------------------------------------------------------
         <p>Your all-in-one AI solution for creating professional audio ads.</p>
         """)
     gr.Markdown("""
     **Welcome to Ai Ads Promo!**
     This simple, easy-to-use app helps you create amazing audio ads in just a few steps. Here’s how it works:
     1. **Script Generation:**
+       - Share your idea and let our AI craft a clear and engaging voice-over script, along with sound design and music suggestions.
     2. **Voice Synthesis:**
        - Convert your script into a natural-sounding voice-over using advanced text-to-speech technology.
     3. **Music Production:**
        - Generate a custom music track that perfectly fits your ad.
+    4. **Sound Design:**
+       - Generate creative sound effects based on our sound design suggestions.
+    5. **Audio Blending:**
+       - Combine your voice-over, sound effects, and music seamlessly. Enable ducking to lower background audio during voice segments.
     **Benefits:**
     - **Easy to Use:** Designed for everyone – no technical skills required.
                 outputs=[music_output],
             )
+        # Step 4: Sound Design Generation
+        with gr.Tab("🎧 Sound Design Generation"):
+            gr.Markdown("Generate a creative sound design track based on the sound design suggestions from the script.")
+            generate_sound_design_button = gr.Button("Generate Sound Design", variant="primary")
+            sound_design_audio_output = gr.Audio(label="Generated Sound Design (WAV)", type="filepath")
+            generate_sound_design_button.click(
+                fn=generate_sound_design,
+                inputs=[sound_design_output],
+                outputs=[sound_design_audio_output],
+            )
+        # Step 5: Audio Blending (Voice + Sound Design + Music)
         with gr.Tab("🎚️ Audio Blending"):
+            gr.Markdown("Blend your voice-over, sound design, and music track. The background audio (music and sound design) can be ducked during voice segments.")
             ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
             duck_level_slider = gr.Slider(
                 label="Ducking Level (dB attenuation)",
                 step=1,
                 value=10
             )
+            blend_button = gr.Button("Blend Audio", variant="primary")
             blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")
             blend_button.click(
                 fn=blend_audio,
+                inputs=[voice_audio_output, sound_design_audio_output, music_output, ducking_checkbox, duck_level_slider],
                 outputs=blended_output
             )