Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Community

Bils committed on Jan 31

Commit

041bd28

verified ·

1 Parent(s): 6a5d04a

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -127

app.py CHANGED Viewed

@@ -1,165 +1,145 @@
-import io
-from pathlib import Path
-from typing import Tuple, Optional
 import gradio as gr
-import numpy as np
-import torch
-from PIL import Image
 from dotenv import load_dotenv
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from huggingface_hub import login
-import os
-# Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
-if hf_token:
-    login(token=hf_token)
-# Device configuration
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-# Load models
-@spaces.GPU
-def load_models():
-    """Load both models with proper device placement"""
-    caption_pipe = pipeline(
-        "image-to-text",
-        model="nlpconnect/vit-gpt2-image-captioning",
-        device=device
-    )
-    audio_pipe = DiffusionPipeline.from_pretrained(
-        "cvssp/audioldm2",
-        token=hf_token,
-        torch_dtype=torch_dtype
-    )
-    return caption_pipe, audio_pipe
-caption_pipe, audio_pipe = load_models()
-def analyze_image(image_bytes: bytes) -> Tuple[str, bool]:
-    """Generate caption from image bytes with enhanced error handling"""
     try:
-        image = Image.open(io.BytesIO(image_bytes))
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        results = caption_pipe(image)
         if not results or not isinstance(results, list):
-            return "Error: Invalid response from caption model", True
-        caption = results[0].get("generated_text", "").strip()
-        return caption or "No caption generated", not bool(caption)
     except Exception as e:
-        return f"Image processing error: {str(e)}", True
 @spaces.GPU(duration=120)
-def generate_audio(caption: str) -> Optional[Tuple[int, np.ndarray]]:
-    """Generate audio from caption with resource management"""
     try:
-        # Device management with context
-        original_device = next(audio_pipe.parameters()).device
-        audio_pipe.to(device)
-        # Generation with progress awareness
-        audio = audio_pipe(
             prompt=caption,
             num_inference_steps=50,
-            guidance_scale=7.5,
-            audio_length_in_s=5.0  # Keep audio generation short
-        ).audios[0]
-        # Post-processing
-        audio = audio.squeeze()  # Handle mono channel
-        audio = np.clip(audio, -1, 1)  # Ensure valid range
-        return (16000, audio)
     except Exception as e:
-        print(f"Audio generation error: {str(e)}")
         return None
-    finally:
-        audio_pipe.to(original_device)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-# UI Components
 css = """
-#col-container {
-    max-width: 800px;
     margin: 0 auto;
-}
-.disclaimer {
-    font-size: 0.9em;
-    color: #666;
-}
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-            <h1 style="text-align: center;">🎶 Image to Sound Effect Generator</h1>
-            <p style="text-align: center;">
-                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-            </p>
         """)
-        with gr.Row():
-            image_input = gr.Image(type="filepath", label="Upload Image")
-            caption_output = gr.Textbox(label="Generated Description", interactive=False)
-        with gr.Row():
-            generate_btn = gr.Button("Generate Description", variant="primary")
-            audio_output = gr.Audio(label="Generated Sound", interactive=False)
-            sound_btn = gr.Button("Generate Sound", variant="secondary")
-        gr.Examples(
-            examples=[str(Path(__file__).parent / "examples" / f) for f in ["storm.jpg", "city.jpg"]],
-            inputs=image_input,
-            outputs=[caption_output, audio_output],
-            fn=lambda x: (analyze_image(Path(x).read_bytes())[0], None),
-            cache_examples=True
-        )
-        gr.Markdown("### 🛠️ Usage Tips")
-        gr.Markdown("""
-            - Use clear, high-contrast images for best results
-            - Complex scenes may require multiple generations
-            - Keep sound generation under 10 seconds for quick results
-        """)
-        gr.Markdown("### ⚠️ Disclaimer", elem_classes="disclaimer")
-        gr.Markdown("""
-            Generated content may not always be accurate. Use at your own discretion.
-            [Privacy Policy](https://bilsimaging.com/privacy) |
-            [Terms of Service](https://bilsimaging.com/terms)
-        """)
-    # Event handling
-    generate_btn.click(
-        fn=lambda x: analyze_image(Path(x).read_bytes())[0],
-        inputs=image_input,
-        outputs=caption_output,
-        api_name="describe"
-    )
-    sound_btn.click(
-        fn=generate_audio,
-        inputs=caption_output,
-        outputs=audio_output,
-        api_name="generate_sound"
     )
-    # Input validation
-    image_input.change(
-        fn=lambda: [gr.update(value=""), gr.update(value=None)],
-        outputs=[caption_output, audio_output]
     )
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0" if os.getenv("SPACE_ID") else "127.0.0.1")

+import spaces
+import os
+import tempfile
 import gradio as gr
 from dotenv import load_dotenv
+import torch
+from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+device_id = 0 if torch.cuda.is_available() else -1
+captioning_pipeline = pipeline(
+    "image-to-text",
+    model="nlpconnect/vit-gpt2-image-captioning",
+    device=device_id
+)
+pipe = DiffusionPipeline.from_pretrained(
+    "cvssp/audioldm2",
+    use_auth_token=hf_token
+)
+@spaces.GPU(duration=120)
+def analyze_image_with_free_model(image_file):
     try:
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            temp_file.write(image_file)
+            temp_image_path = temp_file.name
+        results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
+            return "Error: Could not generate caption.", True
+        caption = results[0].get("generated_text", "").strip()
+        if not caption:
+            return "No caption was generated.", True
+        return caption, False
     except Exception as e:
+        return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
+def get_audioldm_from_caption(caption):
     try:
+        pipe.to("cuda")
+        audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
+            guidance_scale=7.5
+        )
+        pipe.to("cpu")
+        audio = audio_output.audios[0]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+            write(temp_wav.name, 16000, audio)
+            return temp_wav.name
     except Exception as e:
+        print(f"Error generating audio from caption: {e}")
         return None
 css = """
+#col-container{
     margin: 0 auto;
+    max-width: 800px;
+    }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
+    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+    <p style="text-align: center;">
+        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+    </p>
         """)
+    gr.Markdown("""
+    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
+    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
+    **💡 How it works:**
+    1. **Upload an image**: Choose an image that you'd like to analyze.
+    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
+    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
+       sound effect that matches the image context.
+    Enjoy the journey from visual to auditory sensation with just a few clicks!
+    """)
+    image_upload = gr.File(label="Upload Image", type="binary")
+    generate_description_button = gr.Button("Generate Description")
+    caption_display = gr.Textbox(label="Image Description", interactive=False)
+    generate_sound_button = gr.Button("Generate Sound Effect")
+    audio_output = gr.Audio(label="Generated Sound Effect")
+    gr.Markdown("""
+    ## 👥 How You Can Contribute
+    We welcome contributions and suggestions for improvements. Your feedback is invaluable
+    to the continuous enhancement of this application.
+    For support, questions, or to contribute, please contact us at
+    [[email protected]](mailto:[email protected]).
+    Support our work and get involved by donating through
+    [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
+    """)
+    gr.Markdown("""
+    ## 📢 Stay Connected
+    This app is a testament to the creative possibilities that emerge when technology meets art.
+    Enjoy exploring the auditory landscape of your images!
+    """)
+    def update_caption(image_file):
+        description, _ = analyze_image_with_free_model(image_file)
+        return description
+    def generate_sound(description):
+        if not description or description.startswith("Error"):
+            return None
+        audio_path = get_audioldm_from_caption(description)
+        return audio_path
+    generate_description_button.click(
+        fn=update_caption,
+        inputs=image_upload,
+        outputs=caption_display
     )
+    generate_sound_button.click(
+        fn=generate_sound,
+        inputs=caption_display,
+        outputs=audio_output
     )
+    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
+    html = gr.HTML()
+demo.launch(debug=True, share=True)