Bils committed on
Commit 4d9e689 · verified · 1 Parent(s): bacf407

Update app.py

Files changed (1)
  app.py +224 -109
app.py CHANGED
@@ -1,145 +1,260 @@
-import spaces
 import os
 import tempfile
-import gradio as gr
-from dotenv import load_dotenv
 import torch
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pathlib import Path

 load_dotenv()
-hf_token = os.getenv("HF_TKN")

-device_id = 0 if torch.cuda.is_available() else -1

-captioning_pipeline = pipeline(
-    "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning",
-    device=device_id
-)

-pipe = DiffusionPipeline.from_pretrained(
-    "cvssp/audioldm2",
-    use_auth_token=hf_token
-)

 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name

-        results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True

         caption = results[0].get("generated_text", "").strip()
         if not caption:
-            return "No caption was generated.", True
-        return caption, False

     except Exception as e:
-        return f"Error analyzing image: {e}", True

 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
     try:
-        pipe.to("cuda")
-        audio_output = pipe(
-            prompt=caption,
-            num_inference_steps=50,
-            guidance_scale=7.5
-        )
-        pipe.to("cpu")
-        audio = audio_output.audios[0]

-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
-        return temp_wav.name

     except Exception as e:
-        print(f"Error generating audio from caption: {e}")
-        return None
 css = """
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-}
 """

-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.HTML("""
-        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-        <p style="text-align: center;">
-            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-        </p>
         """)
-        gr.Markdown("""
-        Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-        descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-
-        **💡 How it works:**
-        1. **Upload an image**: Choose an image that you'd like to analyze.
-        2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-        3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-           sound effect that matches the image context.
-
-        Enjoy the journey from visual to auditory sensation with just a few clicks!
-        """)
-
-        image_upload = gr.File(label="Upload Image", type="binary")
-        generate_description_button = gr.Button("Generate Description")
-        caption_display = gr.Textbox(label="Image Description", interactive=False)
-        generate_sound_button = gr.Button("Generate Sound Effect")
-        audio_output = gr.Audio(label="Generated Sound Effect")
-
-        gr.Markdown("""
-        ## 👥 How You Can Contribute
-        We welcome contributions and suggestions for improvements. Your feedback is invaluable
-        to the continuous enhancement of this application.
-
-        For support, questions, or to contribute, please contact us at
-        [email protected].
-
-        Support our work and get involved by donating through
-        [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-        """)
-
-        gr.Markdown("""
-        ## 📢 Stay Connected
-        This app is a testament to the creative possibilities that emerge when technology meets art.
-        Enjoy exploring the auditory landscape of your images!
-        """)
-
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
-        return description
-
-    def generate_sound(description):
-        if not description or description.startswith("Error"):
-            return None
-        audio_path = get_audioldm_from_caption(description)
-        return audio_path
-
-    generate_description_button.click(
-        fn=update_caption,
-        inputs=image_upload,
-        outputs=caption_display
-    )

-    generate_sound_button.click(
-        fn=generate_sound,
-        inputs=caption_display,
-        outputs=audio_output
     )
-
-    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-    html = gr.HTML()

-demo.launch(debug=True, share=True)

+import gradio as gr
 import os
 import tempfile
 import torch
+import numpy as np
 from scipy.io.wavfile import write
+from dotenv import load_dotenv
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from PIL import Image
+from pydub import AudioSegment
+from typing import List
+from functools import lru_cache
+import spaces  # the ZeroGPU decorator lives in the top-level `spaces` package, not huggingface_hub
+
+# Load environment variables (HF_TKN is used to authenticate model downloads)
 load_dotenv()
+HF_TOKEN = os.getenv("HF_TKN")
+
+# Device configuration
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Initialize models once; functools.lru_cache stands in for the nonexistent
+# gr.cache() so repeated calls reuse the same pipelines
+@lru_cache(maxsize=1)
+def load_caption_model():
+    return pipeline(
+        "image-to-text",
+        model="Salesforce/blip-image-captioning-base",
+        device=device
+    )
+
+@lru_cache(maxsize=1)
+def load_audio_model():
+    pipe = DiffusionPipeline.from_pretrained(
+        "cvssp/audioldm2",
+        use_auth_token=HF_TOKEN
+    )
+    return pipe
+
+caption_pipe = load_caption_model()
+audio_pipe = load_audio_model().to(device)

 @spaces.GPU(duration=120)
+def analyze_image(image_file):
+    """Generate caption from image with validation"""
     try:
+        # Validate image; gr.Image(type="filepath") passes a path on disk,
+        # so open the file directly rather than wrapping it in BytesIO
+        try:
+            image = Image.open(image_file)
+            image.verify()                  # checks integrity but exhausts the handle
+            image = Image.open(image_file)  # reopen for actual inference
+        except Exception as e:
+            raise ValueError(f"Invalid image file: {str(e)}")

+        results = caption_pipe(image)
         if not results or not isinstance(results, list):
+            raise RuntimeError("No caption generated")

         caption = results[0].get("generated_text", "").strip()
         if not caption:
+            raise RuntimeError("Empty caption generated")
+
+        return caption

     except Exception as e:
+        raise gr.Error(f"Image processing error: {str(e)}")

 @spaces.GPU(duration=120)
+def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
+    """Generate audio from a single prompt"""
     try:
+        if not prompt or len(prompt) < 10:
+            raise ValueError("Prompt must be at least 10 characters")
+
+        with torch.inference_mode():
+            audio = audio_pipe(
+                prompt=prompt,
+                num_inference_steps=int(num_steps),
+                guidance_scale=guidance_scale,
+                audio_length_in_s=10
+            ).audios[0]

+        # Convert the float waveform to 16-bit PCM so pydub can re-read the WAV
+        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+            write(tmpfile.name, 16000, audio)
+        return tmpfile.name

     except Exception as e:
+        raise gr.Error(f"Audio generation error: {str(e)}")
 
+@spaces.GPU(duration=120)
+def blend_audios(audio_files: List[str]) -> str:
+    """Mix multiple audio files into one"""
+    try:
+        if not audio_files:
+            raise ValueError("No audio files to blend")
+
+        # Load first audio to get base parameters
+        base_audio = AudioSegment.from_wav(audio_files[0])
+        mixed = base_audio
+
+        # Mix subsequent tracks
+        for file in audio_files[1:]:
+            track = AudioSegment.from_wav(file)
+            if len(track) > len(mixed):
+                mixed = mixed.overlay(track[:len(mixed)])
+            else:
+                mixed = mixed.overlay(track)
+
+        # Export mixed audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+            mixed.export(tmpfile.name, format="wav")
+        return tmpfile.name
+
+    except Exception as e:
+        raise gr.Error(f"Audio mixing error: {str(e)}")
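+# Note: AudioSegment.overlay() never extends the base segment, so the blended
+# clip keeps the first track's length and longer tracks are trimmed to fit.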
+
+def process_inputs(input_choice, image_file, num_steps, guidance_scale, *prompts):
+    """Handle both image and text input modes"""
+    try:
+        # Filter empty prompts
+        valid_prompts = [p.strip() for p in prompts if p and p.strip()]
+
+        if input_choice == "Image":
+            if not image_file:
+                raise gr.Error("Please upload an image")
+            main_prompt = analyze_image(image_file)
+            valid_prompts = [main_prompt] + valid_prompts
+        else:
+            if not valid_prompts:
+                raise gr.Error("Please enter at least one text prompt")
+
+        # Generate audio for each prompt, honoring the advanced settings
+        audio_files = [generate_audio(p, num_steps, guidance_scale) for p in valid_prompts]
+
+        # Blend all audio files
+        final_audio = blend_audios(audio_files)
+
+        # Pad per-track values so the output count matches the five audio slots
+        tracks = [gr.update(value=f, visible=True) for f in audio_files[:5]]
+        tracks += [gr.update(visible=False)] * (5 - len(tracks))
+        return [valid_prompts, final_audio] + tracks
+
+    except Exception as e:
+        raise gr.Error(str(e))
+
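+# process_inputs returns one value per output of the click handler below:
+# the prompt list, the blended WAV path, then an update for each track slot.
+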
+# Gradio interface
 css = """
+#main-container { max-width: 800px; margin: 0 auto; }
+.dark { background: #1a1a1a; }
+.prompt-box { margin-bottom: 10px; }
+.audio-track { margin: 5px 0; }
 """

+with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
+    with gr.Column(elem_id="main-container"):
+        gr.Markdown("""
+        # 🎨 Image to Sound Generator
+        Transform visual content or text prompts into mixed sound effects!
         """)
+
+        # Input Mode Selector
+        input_choice = gr.Radio(
+            choices=["Image", "Text"],
+            value="Image",
+            label="Input Mode",
+            interactive=True
+        )
+
+        # Image Input Section
+        with gr.Row(visible=True) as image_row:
+            image_input = gr.Image(type="filepath", label="Upload Image")
+
+        # Text Input Section: five pre-built slots (Gradio callbacks cannot
+        # create new components, so the last two start hidden)
+        with gr.Column(visible=False) as text_inputs_col:
+            prompt_components = [
+                gr.Textbox(label=f"Sound Effect {i+1}", lines=2, visible=(i < 3))
+                for i in range(5)
+            ]
+            add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")
+
+        # Dynamic prompt management: track how many slots are visible
+        current_prompts = gr.State(value=3)
+
+        def add_prompt(current_count):
+            new_count = min(current_count + 1, 5)
+            updates = [gr.update(visible=(i < new_count)) for i in range(5)]
+            return [new_count] + updates
+
+        add_prompt_btn.click(
+            fn=add_prompt,
+            inputs=current_prompts,
+            outputs=[current_prompts] + prompt_components
+        )
+
+        # Toggle between image/text inputs
+        def toggle_inputs(choice):
+            if choice == "Image":
+                return [gr.update(visible=True), gr.update(visible=False)]
+            return [gr.update(visible=False), gr.update(visible=True)]
+
+        input_choice.change(
+            fn=toggle_inputs,
+            inputs=input_choice,
+            outputs=[image_row, text_inputs_col]
+        )
+
+        # Generation Controls
+        with gr.Accordion("Advanced Settings", open=False):
+            steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
+            guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")
+
+        generate_btn = gr.Button("Generate Mixed Sound", variant="primary")
+
+        # Outputs
+        with gr.Column():
+            gr.Markdown("### Generation Results")
+            prompt_display = gr.JSON(label="Used Prompts")
+            final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)
+
+            with gr.Accordion("Individual Tracks", open=False):
+                track_components = [gr.Audio(visible=False) for _ in range(5)]
+
+        # Examples: each row carries an image plus three prompts; the second
+        # row has no image, so it falls back to Text mode
+        gr.Examples(
+            examples=[
+                ["examples/storm.jpg", "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
+                [None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"]
+            ],
+            inputs=[image_input] + prompt_components[:3],
+            outputs=[prompt_display, final_audio],
+            fn=lambda img, *prompts: process_inputs(
+                "Image" if img else "Text", img, 100, 7.5, *prompts
+            )[:2],
+            cache_examples=True
+        )
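+        # cache_examples=True runs both examples once at startup and stores the
+        # results, so clicking an example replays cached audio instantly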

+        # Contribution Section
+        with gr.Column():
+            gr.Markdown("""
+            ## 👥 How You Can Contribute
+            We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
+            Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
+            """)
+            gr.HTML("""
+            <div style="text-align: center;">
+                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
+                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
+                </a>
+            </div>
+            """)
+
+        # Footer
+        gr.Markdown("""
+        ---
+        [GitHub Repository](https://github.com/bilsimaging/Imaginesound)
+        """)


+    # Event handling: slider values feed process_inputs ahead of the prompt boxes
+    generate_btn.click(
+        fn=process_inputs,
+        inputs=[input_choice, image_input, steps_slider, guidance_slider] + prompt_components,
+        outputs=[prompt_display, final_audio, *track_components]
     )

+if __name__ == "__main__":
+    app.launch(debug=True, share=True)