Bils committed (verified)
Commit 698d4cd · 1 Parent(s): 2f95323

Update app.py

Files changed (1)
  app.py +111 -228
app.py CHANGED
@@ -1,262 +1,145 @@
- import gradio as gr
  import os
  import tempfile
  import torch
- import numpy as np
  from scipy.io.wavfile import write
- from dotenv import load_dotenv
  from diffusers import DiffusionPipeline
  from transformers import pipeline
- from PIL import Image
- import io
- from pydub import AudioSegment
- from typing import List
- from functools import lru_cache

- # Load environment variables
  load_dotenv()
- HF_TOKEN = os.getenv("HF_TKN")
-
- # Device configuration
- device = "cuda" if torch.cuda.is_available() else "cpu"

- # Initialize models with caching
- @lru_cache(maxsize=None)
- def load_caption_model():
-     return pipeline(
-         "image-to-text",
-         model="Salesforce/blip-image-captioning-base",
-         device=device
-     )

- @lru_cache(maxsize=None)
- def load_audio_model():
-     pipe = DiffusionPipeline.from_pretrained(
-         "cvssp/audioldm2",
-         use_auth_token=HF_TOKEN
-     )
-     return pipe

- caption_pipe = load_caption_model()
- audio_pipe = load_audio_model().to(device)

- def analyze_image(image_file):
-     """Generate caption from image with validation"""
      try:
-         # Validate image
-         try:
-             image = Image.open(io.BytesIO(image_file))
-             image.verify()
-             image = Image.open(io.BytesIO(image_file))
-         except Exception as e:
-             raise ValueError(f"Invalid image file: {str(e)}")

-         results = caption_pipe(image)
          if not results or not isinstance(results, list):
-             raise RuntimeError("No caption generated")

          caption = results[0].get("generated_text", "").strip()
          if not caption:
-             raise RuntimeError("Empty caption generated")
-
-         return caption

      except Exception as e:
-         raise gr.Error(f"Image processing error: {str(e)}")

- def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
-     """Generate audio from single prompt"""
      try:
-         if not prompt or len(prompt) < 10:
-             raise ValueError("Prompt must be at least 10 characters")
-
-         with torch.inference_mode():
-             audio = audio_pipe(
-                 prompt=prompt,
-                 num_inference_steps=int(num_steps),
-                 guidance_scale=guidance_scale,
-                 audio_length_in_s=10
-             ).audios[0]
-
-         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
-             write(tmpfile.name, 16000, audio)
-             return tmpfile.name
-
-     except Exception as e:
-         raise gr.Error(f"Audio generation error: {str(e)}")

- def blend_audios(audio_files: List[str]) -> str:
-     """Mix multiple audio files into one"""
-     try:
-         if not audio_files:
-             raise ValueError("No audio files to blend")
-
-         # Load first audio to get base parameters
-         base_audio = AudioSegment.from_wav(audio_files[0])
-         mixed = base_audio
-
-         # Mix subsequent tracks
-         for file in audio_files[1:]:
-             track = AudioSegment.from_wav(file)
-             if len(track) > len(mixed):
-                 mixed = mixed.overlay(track[:len(mixed)])
-             else:
-                 mixed = mixed.overlay(track)
-
-         # Export mixed audio
-         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
-             mixed.export(tmpfile.name, format="wav")
-             return tmpfile.name
-
-     except Exception as e:
-         raise gr.Error(f"Audio mixing error: {str(e)}")
-
- def process_inputs(input_choice, image_file, *prompts):
-     """Handle both image and text input modes"""
-     try:
-         # Filter empty prompts
-         valid_prompts = [p.strip() for p in prompts if p.strip()]
-
-         if input_choice == "Image":
-             if not image_file:
-                 raise gr.Error("Please upload an image")
-             main_prompt = analyze_image(image_file)
-             valid_prompts = [main_prompt] + valid_prompts
-         else:
-             if not valid_prompts:
-                 raise gr.Error("Please enter at least one text prompt")
-
-         # Generate audio for each prompt
-         audio_files = []
-         for idx, prompt in enumerate(valid_prompts):
-             audio_path = generate_audio(prompt)
-             audio_files.append(audio_path)
-
-         # Blend all audio files
-         final_audio = blend_audios(audio_files)
-         return valid_prompts, final_audio, audio_files

      except Exception as e:
-         raise gr.Error(str(e))


- # Gradio interface
  css = """
- #main-container { max-width: 800px; margin: 0 auto; }
- .dark { background: #1a1a1a; }
- .prompt-box { margin-bottom: 10px; }
- .audio-track { margin: 5px 0; }
  """

- with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
-     with gr.Column(elem_id="main-container"):
-         gr.Markdown("""
-         # 🎨 Image to Sound Generator
-         Transform visual content or text prompts into mixed sound effects!
          """)
-
-         # Input Mode Selector
-         input_choice = gr.Radio(
-             choices=["Image", "Text"],
-             value="Image",
-             label="Input Mode",
-             interactive=True
-         )
-
-         # Image Input Section
-         with gr.Row(visible=True) as image_row:
-             image_input = gr.Image(type="filepath", label="Upload Image")
-
-         # Text Input Section
-         with gr.Column(visible=False) as text_inputs_col:
-             prompt_components = [gr.Textbox(label=f"Sound Effect {i+1}", lines=2) for i in range(3)]
-             add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")
-
-         # Dynamic prompt management
-         current_prompts = gr.State(value=3)
-
-         def add_prompt(current_count):
-             new_count = current_count + 1
-             new_prompt = gr.Textbox(label=f"Sound Effect {new_count}", lines=2, visible=True)
-             return [new_count] + [new_prompt] + [gr.update(visible=True)]*(new_count)
-
-         add_prompt_btn.click(
-             fn=add_prompt,
-             inputs=current_prompts,
-             outputs=[current_prompts] + prompt_components + [text_inputs_col]
-         )
-
-         # Toggle between image/text inputs
-         def toggle_inputs(choice):
-             if choice == "Image":
-                 return [gr.update(visible=True), gr.update(visible=False)]
-             return [gr.update(visible=False), gr.update(visible=True)]
-
-         input_choice.change(
-             fn=toggle_inputs,
-             inputs=input_choice,
-             outputs=[image_row, text_inputs_col]
-         )
-
-         # Generation Controls
-         with gr.Accordion("Advanced Settings", open=False):
-             steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
-             guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")
-
-         generate_btn = gr.Button("Generate Mixed Sound", variant="primary")
-
-         # Outputs
-         with gr.Column():
-             gr.Markdown("### Generation Results")
-             prompt_display = gr.JSON(label="Used Prompts")
-             final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)
-
-             with gr.Accordion("Individual Tracks", open=False):
-                 track_components = [gr.Audio(visible=False) for _ in range(5)]
-
-         # Examples
-         gr.Examples(
-             examples=[
-                 ["examples/storm.jpg", "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
-                 [None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"]
-             ],
-             inputs=[image_input] + prompt_components[:2],
-             outputs=[prompt_display, final_audio],
-             fn=lambda *x: process_inputs("Image", *x),
-             cache_examples=True
-         )

-         # Contribution Section
-         with gr.Column():
-             gr.Markdown("""
-             ## 👥 How You Can Contribute
-             We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
-             Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-             """)
-             gr.HTML("""
-             <div style="text-align: center;">
-                 <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
-                     <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
-                 </a>
-             </div>
-             """)
-
-         # Footer
-         gr.Markdown("""
-         ---
-         *Powered by [BLIP](https://huggingface.co/Salesforce/blip-image-captioning-base) and
-         [AudioLDM 2](https://huggingface.co/cvssp/audioldm2) •
-         [GitHub Repository](https://github.com/bilsimaging/Imaginesound)*
-         """)
-
-     # Event handling
-     generate_btn.click(
-         fn=process_inputs,
-         inputs=[input_choice, image_input] + prompt_components,
-         outputs=[prompt_display, final_audio, *track_components]
      )

- # Enable queuing for concurrent processing
- app.queue(concurrency_count=3)

- if __name__ == "__main__":
-     app.launch(debug=True, share=True)

+ import spaces
  import os
  import tempfile
+ import gradio as gr
+ from dotenv import load_dotenv
  import torch
  from scipy.io.wavfile import write
  from diffusers import DiffusionPipeline
  from transformers import pipeline
+ from pathlib import Path

  load_dotenv()
+ hf_token = os.getenv("HF_TKN")

+ device_id = 0 if torch.cuda.is_available() else -1
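+ # transformers pipelines take an integer device index: 0 selects the first
+ # CUDA device, -1 keeps inference on the CPU.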

+ captioning_pipeline = pipeline(
+     "image-to-text",
+     model="nlpconnect/vit-gpt2-image-captioning",
+     device=device_id
+ )
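+ # The image-to-text pipeline accepts an image path (or PIL image) and returns
+ # a list like [{"generated_text": "..."}].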

+ pipe = DiffusionPipeline.from_pretrained(
+     "cvssp/audioldm2",
+     use_auth_token=hf_token
+ )
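+ # AudioLDM 2 is loaded on the CPU at startup; get_audioldm_from_caption()
+ # moves it to the GPU per request. Note: recent diffusers releases deprecate
+ # use_auth_token in favor of token= (an assumption worth checking against the
+ # pinned diffusers version).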

+ @spaces.GPU(duration=120)
+ def analyze_image_with_free_model(image_file):
      try:
+         # gr.File(type="binary") delivers raw bytes, but the captioning
+         # pipeline expects a file path or PIL image, so spool to a temp file.
+         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+             temp_file.write(image_file)
+             temp_image_path = temp_file.name

+         results = captioning_pipeline(temp_image_path)
          if not results or not isinstance(results, list):
+             return "Error: Could not generate caption.", True

          caption = results[0].get("generated_text", "").strip()
          if not caption:
+             return "No caption was generated.", True
+         return caption, False

      except Exception as e:
+         return f"Error analyzing image: {e}", True
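+ # Convention: the caption helper above returns (text, error_flag); the audio
+ # helper below returns a WAV path on success or None on failure.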

+ @spaces.GPU(duration=120)
+ def get_audioldm_from_caption(caption):
      try:
+         # Move the pipeline onto the GPU only for the duration of the call,
+         # then hand it back to the CPU so GPU memory is released between calls.
+         pipe.to("cuda")
+         audio_output = pipe(
+             prompt=caption,
+             num_inference_steps=50,
+             guidance_scale=7.5
+         )
+         pipe.to("cpu")
+         audio = audio_output.audios[0]

+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+             # AudioLDM 2 generates 16 kHz audio, hence the fixed sample rate.
+             write(temp_wav.name, 16000, audio)
+             return temp_wav.name
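
+ # A minimal local smoke test (hypothetical; on Spaces these helpers run inside
+ # the ZeroGPU queue, and "example.jpg" is a placeholder path):
+ #   caption, failed = analyze_image_with_free_model(Path("example.jpg").read_bytes())
+ #   if not failed:
+ #       print(get_audioldm_from_caption(caption))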

      except Exception as e:
+         print(f"Error generating audio from caption: {e}")
+         return None

  css = """
+ #col-container{
+     margin: 0 auto;
+     max-width: 800px;
+ }
  """

+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.HTML("""
+             <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+             <p style="text-align: center;">
+                 ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+             </p>
          """)

+         gr.Markdown("""
+         Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
+         descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
+
+         **💡 How it works:**
+         1. **Upload an image**: Choose an image that you'd like to analyze.
+         2. **Generate Description**: Click 'Generate Description' to get a textual description of your uploaded image.
+         3. **Generate Sound Effect**: Based on the image description, click 'Generate Sound Effect' to create a
+            sound effect that matches the image context.
+
+         Enjoy the journey from visual to auditory sensation with just a few clicks!
+         """)
+
+         image_upload = gr.File(label="Upload Image", type="binary")
+         generate_description_button = gr.Button("Generate Description")
+         caption_display = gr.Textbox(label="Image Description", interactive=False)
+         generate_sound_button = gr.Button("Generate Sound Effect")
+         audio_output = gr.Audio(label="Generated Sound Effect")
+
+         gr.Markdown("""
+         ## 👥 How You Can Contribute
+         We welcome contributions and suggestions for improvements. Your feedback is invaluable
+         to the continuous enhancement of this application.
+
+         For support, questions, or to contribute, please contact us at
+         [[email protected]](mailto:[email protected]).
+
+         Support our work and get involved by donating through
+         [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
+         """)
+
+         gr.Markdown("""
+         ## 📢 Stay Connected
+         This app is a testament to the creative possibilities that emerge when technology meets art.
+         Enjoy exploring the auditory landscape of your images!
+         """)
+
+         def update_caption(image_file):
+             description, _ = analyze_image_with_free_model(image_file)
+             return description
+
+         def generate_sound(description):
+             # Skip audio generation when captioning failed; hard failures are
+             # prefixed with "Error" by the caption helper.
+             if not description or description.startswith("Error"):
+                 return None
+             audio_path = get_audioldm_from_caption(description)
+             return audio_path
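+
+         # Wire the buttons: each click runs its helper and routes the result
+         # into the matching output component.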
+         generate_description_button.click(
+             fn=update_caption,
+             inputs=image_upload,
+             outputs=caption_display
          )

+         generate_sound_button.click(
+             fn=generate_sound,
+             inputs=caption_display,
+             outputs=audio_output
+         )
+
+         gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
+         html = gr.HTML()  # unused placeholder component

+ demo.launch(debug=True, share=True)