Bils committed on
Commit 4d9e689 · verified · 1 Parent(s): bacf407

Update app.py

Files changed (1)
  app.py +224 -109
app.py CHANGED
@@ -1,145 +1,260 @@
-import spaces
 import os
 import tempfile
-import gradio as gr
-from dotenv import load_dotenv
 import torch
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pathlib import Path

 load_dotenv()
-hf_token = os.getenv("HF_TKN")

-device_id = 0 if torch.cuda.is_available() else -1

-captioning_pipeline = pipeline(
-    "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning",
-    device=device_id
-)

-pipe = DiffusionPipeline.from_pretrained(
-    "cvssp/audioldm2",
-    use_auth_token=hf_token
-)

 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name

-        results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True

         caption = results[0].get("generated_text", "").strip()
         if not caption:
-            return "No caption was generated.", True
-        return caption, False

     except Exception as e:
-        return f"Error analyzing image: {e}", True

 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
     try:
-        pipe.to("cuda")
-        audio_output = pipe(
-            prompt=caption,
-            num_inference_steps=50,
-            guidance_scale=7.5
-        )
-        pipe.to("cpu")
-        audio = audio_output.audios[0]

-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
-        return temp_wav.name

     except Exception as e:
-        print(f"Error generating audio from caption: {e}")
-        return None
 css = """
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-}
 """

-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.HTML("""
-        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-        <p style="text-align: center;">
-            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-        </p>
         """)
-        gr.Markdown("""
-        Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-        descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-
-        **💡 How it works:**
-        1. **Upload an image**: Choose an image that you'd like to analyze.
-        2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-        3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-           sound effect that matches the image context.
-
-        Enjoy the journey from visual to auditory sensation with just a few clicks!
-        """)
-
-        image_upload = gr.File(label="Upload Image", type="binary")
-        generate_description_button = gr.Button("Generate Description")
-        caption_display = gr.Textbox(label="Image Description", interactive=False)
-        generate_sound_button = gr.Button("Generate Sound Effect")
-        audio_output = gr.Audio(label="Generated Sound Effect")
-
-        gr.Markdown("""
-        ## 👥 How You Can Contribute
-        We welcome contributions and suggestions for improvements. Your feedback is invaluable
-        to the continuous enhancement of this application.
-
-        For support, questions, or to contribute, please contact us at
-        [email protected].
-
-        Support our work and get involved by donating through
-        [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-        """)
-
-        gr.Markdown("""
-        ## 📢 Stay Connected
-        This app is a testament to the creative possibilities that emerge when technology meets art.
-        Enjoy exploring the auditory landscape of your images!
-        """)
-
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
-        return description
-
-    def generate_sound(description):
-        if not description or description.startswith("Error"):
-            return None
-        audio_path = get_audioldm_from_caption(description)
-        return audio_path
-
-    generate_description_button.click(
-        fn=update_caption,
-        inputs=image_upload,
-        outputs=caption_display
-    )

-    generate_sound_button.click(
-        fn=generate_sound,
-        inputs=caption_display,
-        outputs=audio_output
     )
-
-    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-    html = gr.HTML()

-demo.launch(debug=True, share=True)

+import gradio as gr
 import os
 import tempfile
 import torch
+import numpy as np
 from scipy.io.wavfile import write
+from dotenv import load_dotenv
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from PIL import Image
+from pydub import AudioSegment
+from typing import List
+from functools import lru_cache
+import spaces  # the ZeroGPU decorator lives in the top-level `spaces` package, not huggingface_hub
+
+# Load environment variables (HF_TKN is used to authenticate model downloads)
 load_dotenv()
+HF_TOKEN = os.getenv("HF_TKN")
+
+# Device configuration
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Initialize models once; functools.lru_cache stands in for the nonexistent
+# gr.cache() so repeated calls reuse the same pipelines
+@lru_cache(maxsize=1)
+def load_caption_model():
+    return pipeline(
+        "image-to-text",
+        model="Salesforce/blip-image-captioning-base",
+        device=device
+    )
+
+@lru_cache(maxsize=1)
+def load_audio_model():
+    pipe = DiffusionPipeline.from_pretrained(
+        "cvssp/audioldm2",
+        use_auth_token=HF_TOKEN
+    )
+    return pipe
+
+caption_pipe = load_caption_model()
+audio_pipe = load_audio_model().to(device)

 @spaces.GPU(duration=120)
+def analyze_image(image_file):
+    """Generate caption from image with validation"""
     try:
+        # Validate image; gr.Image(type="filepath") passes a path on disk,
+        # so open the file directly rather than wrapping it in BytesIO
+        try:
+            image = Image.open(image_file)
+            image.verify()                  # checks integrity but exhausts the handle
+            image = Image.open(image_file)  # reopen for actual inference
+        except Exception as e:
+            raise ValueError(f"Invalid image file: {str(e)}")

+        results = caption_pipe(image)
         if not results or not isinstance(results, list):
+            raise RuntimeError("No caption generated")

         caption = results[0].get("generated_text", "").strip()
         if not caption:
+            raise RuntimeError("Empty caption generated")
+
+        return caption

     except Exception as e:
+        raise gr.Error(f"Image processing error: {str(e)}")

 @spaces.GPU(duration=120)
+def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
+    """Generate audio from a single prompt"""
     try:
+        if not prompt or len(prompt) < 10:
+            raise ValueError("Prompt must be at least 10 characters")
+
+        with torch.inference_mode():
+            audio = audio_pipe(
+                prompt=prompt,
+                num_inference_steps=int(num_steps),
+                guidance_scale=guidance_scale,
+                audio_length_in_s=10
+            ).audios[0]

+        # Convert the float waveform to 16-bit PCM so pydub can re-read the WAV
+        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+            write(tmpfile.name, 16000, audio)
+        return tmpfile.name

     except Exception as e:
+        raise gr.Error(f"Audio generation error: {str(e)}")
 
+@spaces.GPU(duration=120)
+def blend_audios(audio_files: List[str]) -> str:
+    """Mix multiple audio files into one"""
+    try:
+        if not audio_files:
+            raise ValueError("No audio files to blend")
+
+        # Load first audio to get base parameters
+        base_audio = AudioSegment.from_wav(audio_files[0])
+        mixed = base_audio
+
+        # Mix subsequent tracks
+        for file in audio_files[1:]:
+            track = AudioSegment.from_wav(file)
+            if len(track) > len(mixed):
+                mixed = mixed.overlay(track[:len(mixed)])
+            else:
+                mixed = mixed.overlay(track)
+
+        # Export mixed audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+            mixed.export(tmpfile.name, format="wav")
+        return tmpfile.name
+
+    except Exception as e:
+        raise gr.Error(f"Audio mixing error: {str(e)}")
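+# Note: AudioSegment.overlay() never extends the base segment, so the blended
+# clip keeps the first track's length and longer tracks are trimmed to fit.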
+
+def process_inputs(input_choice, image_file, num_steps, guidance_scale, *prompts):
+    """Handle both image and text input modes"""
+    try:
+        # Filter empty prompts
+        valid_prompts = [p.strip() for p in prompts if p and p.strip()]
+
+        if input_choice == "Image":
+            if not image_file:
+                raise gr.Error("Please upload an image")
+            main_prompt = analyze_image(image_file)
+            valid_prompts = [main_prompt] + valid_prompts
+        else:
+            if not valid_prompts:
+                raise gr.Error("Please enter at least one text prompt")
+
+        # Generate audio for each prompt, honoring the advanced settings
+        audio_files = [generate_audio(p, num_steps, guidance_scale) for p in valid_prompts]
+
+        # Blend all audio files
+        final_audio = blend_audios(audio_files)
+
+        # Pad per-track values so the output count matches the five audio slots
+        tracks = [gr.update(value=f, visible=True) for f in audio_files[:5]]
+        tracks += [gr.update(visible=False)] * (5 - len(tracks))
+        return [valid_prompts, final_audio] + tracks
+
+    except Exception as e:
+        raise gr.Error(str(e))
+
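+# process_inputs returns one value per output of the click handler below:
+# the prompt list, the blended WAV path, then an update for each track slot.
+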
+# Gradio interface
 css = """
+#main-container { max-width: 800px; margin: 0 auto; }
+.dark { background: #1a1a1a; }
+.prompt-box { margin-bottom: 10px; }
+.audio-track { margin: 5px 0; }
 """

+with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
+    with gr.Column(elem_id="main-container"):
+        gr.Markdown("""
+        # 🎨 Image to Sound Generator
+        Transform visual content or text prompts into mixed sound effects!
         """)
+
+        # Input Mode Selector
+        input_choice = gr.Radio(
+            choices=["Image", "Text"],
+            value="Image",
+            label="Input Mode",
+            interactive=True
+        )
+
+        # Image Input Section
+        with gr.Row(visible=True) as image_row:
+            image_input = gr.Image(type="filepath", label="Upload Image")
+
+        # Text Input Section: five pre-built slots (Gradio callbacks cannot
+        # create new components, so the last two start hidden)
+        with gr.Column(visible=False) as text_inputs_col:
+            prompt_components = [
+                gr.Textbox(label=f"Sound Effect {i+1}", lines=2, visible=(i < 3))
+                for i in range(5)
+            ]
+            add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")
+
+        # Dynamic prompt management: track how many slots are visible
+        current_prompts = gr.State(value=3)
+
+        def add_prompt(current_count):
+            new_count = min(current_count + 1, 5)
+            updates = [gr.update(visible=(i < new_count)) for i in range(5)]
+            return [new_count] + updates
+
+        add_prompt_btn.click(
+            fn=add_prompt,
+            inputs=current_prompts,
+            outputs=[current_prompts] + prompt_components
+        )
+
+        # Toggle between image/text inputs
+        def toggle_inputs(choice):
+            if choice == "Image":
+                return [gr.update(visible=True), gr.update(visible=False)]
+            return [gr.update(visible=False), gr.update(visible=True)]
+
+        input_choice.change(
+            fn=toggle_inputs,
+            inputs=input_choice,
+            outputs=[image_row, text_inputs_col]
+        )
+
+        # Generation Controls
+        with gr.Accordion("Advanced Settings", open=False):
+            steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
+            guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")
+
+        generate_btn = gr.Button("Generate Mixed Sound", variant="primary")
+
+        # Outputs
+        with gr.Column():
+            gr.Markdown("### Generation Results")
+            prompt_display = gr.JSON(label="Used Prompts")
+            final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)
+
+            with gr.Accordion("Individual Tracks", open=False):
+                track_components = [gr.Audio(visible=False) for _ in range(5)]
+
+        # Examples: each row carries an image plus three prompts; the second
+        # row has no image, so it falls back to Text mode
+        gr.Examples(
+            examples=[
+                ["examples/storm.jpg", "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
+                [None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"]
+            ],
+            inputs=[image_input] + prompt_components[:3],
+            outputs=[prompt_display, final_audio],
+            fn=lambda img, *prompts: process_inputs(
+                "Image" if img else "Text", img, 100, 7.5, *prompts
+            )[:2],
+            cache_examples=True
+        )
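+        # cache_examples=True runs both examples once at startup and stores the
+        # results, so clicking an example replays cached audio instantly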

+        # Contribution Section
+        with gr.Column():
+            gr.Markdown("""
+            ## 👥 How You Can Contribute
+            We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
+            Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
+            """)
+            gr.HTML("""
+            <div style="text-align: center;">
+                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
+                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
+                </a>
+            </div>
+            """)
+
+        # Footer
+        gr.Markdown("""
+        ---
+        [GitHub Repository](https://github.com/bilsimaging/Imaginesound)
+        """)


+    # Event handling: slider values feed process_inputs ahead of the prompt boxes
+    generate_btn.click(
+        fn=process_inputs,
+        inputs=[input_choice, image_input, steps_slider, guidance_slider] + prompt_components,
+        outputs=[prompt_display, final_audio, *track_components]
     )

+if __name__ == "__main__":
+    app.launch(debug=True, share=True)