Bils committed on
Commit
041bd28
·
verified ·
1 Parent(s): 6a5d04a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -127
app.py CHANGED
@@ -1,165 +1,145 @@
1
- import io
2
- from pathlib import Path
3
- from typing import Tuple, Optional
4
  import gradio as gr
5
- import numpy as np
6
- import torch
7
- from PIL import Image
8
  from dotenv import load_dotenv
 
 
9
  from diffusers import DiffusionPipeline
10
  from transformers import pipeline
11
- from huggingface_hub import login
12
- import os
13
 
14
- # Load environment variables
15
  load_dotenv()
16
  hf_token = os.getenv("HF_TKN")
17
- if hf_token:
18
- login(token=hf_token)
19
-
20
- # Device configuration
21
- device = "cuda" if torch.cuda.is_available() else "cpu"
22
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
23
-
24
- # Load models
25
- @spaces.GPU
26
- def load_models():
27
- """Load both models with proper device placement"""
28
- caption_pipe = pipeline(
29
- "image-to-text",
30
- model="nlpconnect/vit-gpt2-image-captioning",
31
- device=device
32
- )
33
 
34
- audio_pipe = DiffusionPipeline.from_pretrained(
35
- "cvssp/audioldm2",
36
- token=hf_token,
37
- torch_dtype=torch_dtype
38
- )
39
- return caption_pipe, audio_pipe
40
 
41
- caption_pipe, audio_pipe = load_models()
 
 
 
 
42
 
43
- def analyze_image(image_bytes: bytes) -> Tuple[str, bool]:
44
- """Generate caption from image bytes with enhanced error handling"""
 
 
 
 
 
45
  try:
46
- image = Image.open(io.BytesIO(image_bytes))
47
- if image.mode != "RGB":
48
- image = image.convert("RGB")
49
-
50
- results = caption_pipe(image)
51
-
52
  if not results or not isinstance(results, list):
53
- return "Error: Invalid response from caption model", True
54
-
55
- caption = results[0].get("generated_text", "").strip()
56
- return caption or "No caption generated", not bool(caption)
57
 
 
 
 
 
 
58
  except Exception as e:
59
- return f"Image processing error: {str(e)}", True
60
 
61
  @spaces.GPU(duration=120)
62
- def generate_audio(caption: str) -> Optional[Tuple[int, np.ndarray]]:
63
- """Generate audio from caption with resource management"""
64
  try:
65
- # Device management with context
66
- original_device = next(audio_pipe.parameters()).device
67
- audio_pipe.to(device)
68
-
69
- # Generation with progress awareness
70
- audio = audio_pipe(
71
  prompt=caption,
72
  num_inference_steps=50,
73
- guidance_scale=7.5,
74
- audio_length_in_s=5.0 # Keep audio generation short
75
- ).audios[0]
76
-
77
- # Post-processing
78
- audio = audio.squeeze() # Handle mono channel
79
- audio = np.clip(audio, -1, 1) # Ensure valid range
80
- return (16000, audio)
81
-
82
  except Exception as e:
83
- print(f"Audio generation error: {str(e)}")
84
  return None
85
-
86
- finally:
87
- audio_pipe.to(original_device)
88
- if torch.cuda.is_available():
89
- torch.cuda.empty_cache()
90
 
91
- # UI Components
92
  css = """
93
- #col-container {
94
- max-width: 800px;
95
  margin: 0 auto;
96
- }
97
- .disclaimer {
98
- font-size: 0.9em;
99
- color: #666;
100
- }
101
  """
102
 
103
  with gr.Blocks(css=css) as demo:
104
  with gr.Column(elem_id="col-container"):
105
  gr.HTML("""
106
- <h1 style="text-align: center;">🎶 Image to Sound Effect Generator</h1>
107
- <p style="text-align: center;">
108
- ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
109
- </p>
110
  """)
111
-
112
- with gr.Row():
113
- image_input = gr.Image(type="filepath", label="Upload Image")
114
- caption_output = gr.Textbox(label="Generated Description", interactive=False)
115
-
116
- with gr.Row():
117
- generate_btn = gr.Button("Generate Description", variant="primary")
118
- audio_output = gr.Audio(label="Generated Sound", interactive=False)
119
- sound_btn = gr.Button("Generate Sound", variant="secondary")
120
-
121
- gr.Examples(
122
- examples=[str(Path(__file__).parent / "examples" / f) for f in ["storm.jpg", "city.jpg"]],
123
- inputs=image_input,
124
- outputs=[caption_output, audio_output],
125
- fn=lambda x: (analyze_image(Path(x).read_bytes())[0], None),
126
- cache_examples=True
127
- )
128
 
129
- gr.Markdown("### 🛠️ Usage Tips")
130
- gr.Markdown("""
131
- - Use clear, high-contrast images for best results
132
- - Complex scenes may require multiple generations
133
- - Keep sound generation under 10 seconds for quick results
134
- """)
 
 
 
 
 
 
135
 
136
- gr.Markdown("### ⚠️ Disclaimer", elem_classes="disclaimer")
137
- gr.Markdown("""
138
- Generated content may not always be accurate. Use at your own discretion.
139
- [Privacy Policy](https://bilsimaging.com/privacy) |
140
- [Terms of Service](https://bilsimaging.com/terms)
141
- """)
142
 
143
- # Event handling
144
- generate_btn.click(
145
- fn=lambda x: analyze_image(Path(x).read_bytes())[0],
146
- inputs=image_input,
147
- outputs=caption_output,
148
- api_name="describe"
149
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
- sound_btn.click(
152
- fn=generate_audio,
153
- inputs=caption_output,
154
- outputs=audio_output,
155
- api_name="generate_sound"
156
  )
157
 
158
- # Input validation
159
- image_input.change(
160
- fn=lambda: [gr.update(value=""), gr.update(value=None)],
161
- outputs=[caption_output, audio_output]
162
  )
 
 
 
163
 
164
- if __name__ == "__main__":
165
- demo.launch(server_name="0.0.0.0" if os.getenv("SPACE_ID") else "127.0.0.1")
 
1
+ import spaces
2
+ import os
3
+ import tempfile
4
  import gradio as gr
 
 
 
5
  from dotenv import load_dotenv
6
+ import torch
7
+ from scipy.io.wavfile import write
8
  from diffusers import DiffusionPipeline
9
  from transformers import pipeline
10
+ from pathlib import Path
 
11
 
 
12
# Read settings (notably the Hugging Face token) from a local .env file, if
# one exists, then from the process environment.
load_dotenv()
hf_token = os.getenv("HF_TKN")

# transformers pipelines take a device index: 0 selects the first CUDA GPU,
# -1 selects the CPU.
device_id = 0 if torch.cuda.is_available() else -1

# Image -> text model used to describe the uploaded picture.
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Text -> audio diffusion model. ``token`` is the supported keyword for the
# Hub credential; the older ``use_auth_token`` spelling is deprecated and is
# ignored by newer diffusers releases.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    token=hf_token,
)
27
+
28
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Caption an uploaded image with the module-level captioning pipeline.

    Args:
        image_file: Raw bytes of the uploaded image (the upload widget is
            created with ``type="binary"``).

    Returns:
        A ``(message, is_error)`` tuple: ``(caption, False)`` on success, or
        ``(human-readable error text, True)`` when no caption is available.
    """
    if not image_file:
        # Nothing uploaded yet; report it instead of failing inside write().
        return "Error: No image was provided.", True

    temp_image_path = None
    try:
        # The pipeline is given a file path, so spill the bytes to disk first.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False

    except Exception as e:
        return f"Error analyzing image: {e}", True

    finally:
        # delete=False means nothing else removes the file; clean it up here
        # so repeated requests do not fill the temp directory.
        if temp_image_path is not None:
            try:
                os.remove(temp_image_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # caption (or error) being returned.
                pass
46
 
47
@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Generate a sound effect for ``caption`` and save it as a WAV file.

    Args:
        caption: Text prompt passed to the module-level audio pipeline.

    Returns:
        Path of the temporary ``.wav`` file holding the generated audio, or
        ``None`` if generation or writing failed (the error is printed).
    """
    # Use the GPU when one is present, otherwise run on the CPU rather than
    # failing on the device move.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        try:
            pipe.to(device)
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=50,
                guidance_scale=7.5,
            )
        finally:
            # Always move the model back to the CPU, even when generation
            # raises, so the GPU is not left occupied.
            pipe.to("cpu")

        audio = audio_output.audios[0]

        # Reserve a unique path, then close the handle before writing: some
        # platforms refuse to open a file by name while it is still open.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name
        write(wav_path, 16000, audio)
        return wav_path

    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
 
 
 
 
 
66
 
 
67
  css = """
68
+ #col-container{
 
69
  margin: 0 auto;
70
+ max-width: 800px;
71
+ }
 
 
 
72
  """
73
 
74
  with gr.Blocks(css=css) as demo:
75
  with gr.Column(elem_id="col-container"):
76
  gr.HTML("""
77
+ <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
78
+ <p style="text-align: center;">
79
+ ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
80
+ </p>
81
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ gr.Markdown("""
84
+ Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
85
+ descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
86
+
87
+ **💡 How it works:**
88
+ 1. **Upload an image**: Choose an image that you'd like to analyze.
89
+ 2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
90
+ 3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
91
+ sound effect that matches the image context.
92
+
93
+ Enjoy the journey from visual to auditory sensation with just a few clicks!
94
+ """)
95
 
96
+ image_upload = gr.File(label="Upload Image", type="binary")
97
+ generate_description_button = gr.Button("Generate Description")
98
+ caption_display = gr.Textbox(label="Image Description", interactive=False)
99
+ generate_sound_button = gr.Button("Generate Sound Effect")
100
+ audio_output = gr.Audio(label="Generated Sound Effect")
 
101
 
102
+ gr.Markdown("""
103
+ ## 👥 How You Can Contribute
104
+ We welcome contributions and suggestions for improvements. Your feedback is invaluable
105
+ to the continuous enhancement of this application.
106
+
107
+ For support, questions, or to contribute, please contact us at
108
109
+
110
+ Support our work and get involved by donating through
111
+ [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
112
+ """)
113
+
114
+ gr.Markdown("""
115
+ ## 📢 Stay Connected
116
+ This app is a testament to the creative possibilities that emerge when technology meets art.
117
+ Enjoy exploring the auditory landscape of your images!
118
+ """)
119
+
120
def update_caption(image_file):
    """Return only the text part of the captioning result for the UI textbox."""
    # The helper returns (text, is_error); the textbox shows the text either way.
    return analyze_image_with_free_model(image_file)[0]
123
+
124
def generate_sound(description):
    """Turn the displayed image description into a sound-effect file.

    Args:
        description: Current content of the description textbox.

    Returns:
        Path of the generated audio file, or ``None`` when there is no usable
        description (empty, or a status/error message from captioning) or
        when audio generation fails.
    """
    # Status messages produced by the captioning step are not prompts; feeding
    # them to the audio model would generate sound for the error text.
    non_prompt_prefixes = ("Error", "No caption was generated.")

    text = (description or "").strip()
    if not text or text.startswith(non_prompt_prefixes):
        return None
    return get_audioldm_from_caption(description)
129
 
130
+ generate_description_button.click(
131
+ fn=update_caption,
132
+ inputs=image_upload,
133
+ outputs=caption_display
 
134
  )
135
 
136
+ generate_sound_button.click(
137
+ fn=generate_sound,
138
+ inputs=caption_display,
139
+ outputs=audio_output
140
  )
141
+
142
+ gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
143
+ html = gr.HTML()
144
 
145
# Share links are not available inside Hugging Face Spaces (Gradio disables
# them there and warns), so only request one when running outside a Space.
# SPACE_ID is set in the environment of a running Space.
demo.launch(debug=True, share=not os.getenv("SPACE_ID"))