Commit a9aa30e by Bils · verified · 1 parent: 041bd28

Update app.py

Files changed (1):
  1. app.py +85 -22
app.py CHANGED
@@ -1,5 +1,5 @@
-import spaces
 import os
+import io
 import tempfile
 import gradio as gr
 from dotenv import load_dotenv
@@ -8,54 +8,98 @@ from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
+from PIL import Image
+
+import spaces
 
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 
+# Determine if we have access to a GPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 device_id = 0 if torch.cuda.is_available() else -1
 
+# Initialize the image captioning pipeline
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
     device=device_id
 )
 
+# Initialize the text-to-audio pipeline
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
+pipe.to(device)
 
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
+def analyze_image_with_free_model(image_file: bytes):
+    """
+    Analyze the uploaded image using the ViT-GPT2 image captioning pipeline.
+
+    :param image_file: Binary content of the uploaded image.
+    :return: A tuple (caption, error_flag).
+             caption (str) - The generated caption or error message.
+             error_flag (bool) - Indicates if an error occurred.
+    """
     try:
+        # Validate image input
+        if not image_file:
+            return "Error: No image data received.", True
+
+        # Check if the file is a valid image
+        try:
+            Image.open(io.BytesIO(image_file)).verify()
+        except Exception:
+            return "Error: Invalid image file. Please upload a valid image.", True
+
+        # Write the valid image to a temporary file for the pipeline
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
            temp_image_path = temp_file.name
 
+        # Perform image captioning
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True
+            return "Error: Captioning pipeline returned invalid results.", True
 
+        # Extract and clean up the generated caption
         caption = results[0].get("generated_text", "").strip()
         if not caption:
-            return "No caption was generated.", True
+            return "No caption was generated by the model.", True
+
         return caption, False
 
     except Exception as e:
         return f"Error analyzing image: {e}", True
 
 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
+def get_audioldm_from_caption(caption: str):
+    """
+    Generate an audio file (WAV) from a text caption using the AudioLDM2 pipeline.
+
+    :param caption: The text prompt used to generate audio.
+    :return: The path to the generated .wav file, or None if an error occurred.
+    """
     try:
-        pipe.to("cuda")
+        # Move pipeline to GPU (if available)
+        pipe.to(device)
+
+        # Generate audio from text prompt
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
+
+        # Move pipeline back to CPU to free GPU memory
         pipe.to("cpu")
+
+        # Extract the first audio sample
         audio = audio_output.audios[0]
 
+        # Write the audio to a temporary WAV file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
             write(temp_wav.name, 16000, audio)
             return temp_wav.name
@@ -64,6 +108,8 @@ def get_audioldm_from_caption(caption):
         print(f"Error generating audio from caption: {e}")
         return None
 
+
+# Custom CSS for styling the Gradio Blocks
 css = """
 #col-container{
     margin: 0 auto;
@@ -74,25 +120,28 @@ css = """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-        <p style="text-align: center;">
-        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-        </p>
+            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+            <p style="text-align: center;">
+                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+            </p>
         """)
 
         gr.Markdown("""
-        Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-        descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-
+        Welcome to this unique sound effect generator! This tool allows you to upload an image
+        and generate a descriptive caption and a corresponding sound effect, all using free,
+        open-source models on Hugging Face.
+
         **💡 How it works:**
         1. **Upload an image**: Choose an image that you'd like to analyze.
-        2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-        3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-        sound effect that matches the image context.
+        2. **Generate Description**: Click on 'Generate Description' to get a textual
+           description of your uploaded image.
+        3. **Generate Sound Effect**: Based on the image description, click on
+           'Generate Sound Effect' to create a sound effect that matches the image context.
 
         Enjoy the journey from visual to auditory sensation with just a few clicks!
         """)
 
+        # Define Gradio interface elements
         image_upload = gr.File(label="Upload Image", type="binary")
         generate_description_button = gr.Button("Generate Description")
         caption_display = gr.Textbox(label="Image Description", interactive=False)
@@ -113,20 +162,26 @@ with gr.Blocks(css=css) as demo:
 
         gr.Markdown("""
         ## 📢 Stay Connected
-        This app is a testament to the creative possibilities that emerge when technology meets art.
-        Enjoy exploring the auditory landscape of your images!
+        This app is a testament to the creative possibilities that emerge when
+        technology meets art. Enjoy exploring the auditory landscape of your images!
         """)
 
+        # Define the helper functions for Gradio event handlers
         def update_caption(image_file):
-            description, _ = analyze_image_with_free_model(image_file)
+            description, error_flag = analyze_image_with_free_model(image_file)
+            if error_flag:
+                # In case of error, just return the error message
+                return description
             return description
 
         def generate_sound(description):
+            # Validate the description before generating audio
             if not description or description.startswith("Error"):
                 return None
             audio_path = get_audioldm_from_caption(description)
             return audio_path
 
+        # Wire the Gradio events to the functions
         generate_description_button.click(
             fn=update_caption,
             inputs=image_upload,
@@ -138,8 +193,16 @@ with gr.Blocks(css=css) as demo:
             inputs=caption_display,
             outputs=audio_output
         )
-
-        gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
+
+        gr.HTML(
+            '<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image">'
+            '<img src="https://api.visitorbadge.io/api/visitors?path='
+            'https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" '
+            '/></a>'
+        )
+
+        # An extra placeholder if needed
         html = gr.HTML()
 
-demo.launch(debug=True, share=True)
+# Enable debug and optional share. On Spaces, 'share=True' is typically ignored.
+demo.launch(debug=True, share=True)
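
The most significant functional change in this commit is the byte-level validation added to `analyze_image_with_free_model`. A minimal sketch of that pattern in isolation (the `sample.jpg` path is a hypothetical example file, not part of the commit):

```python
import io

from PIL import Image


def is_valid_image(data: bytes) -> bool:
    """Mirror the app's check: does `data` parse as an image?"""
    if not data:
        return False
    try:
        # verify() checks structural integrity without decoding pixel data;
        # the stream must be reopened afterwards for any real processing.
        Image.open(io.BytesIO(data)).verify()
        return True
    except Exception:
        return False


if __name__ == "__main__":
    with open("sample.jpg", "rb") as f:
        print(is_valid_image(f.read()))
```

Because `verify()` leaves the image object unusable for further reads, the app still writes the raw bytes to a temporary file and hands that path to the captioning pipeline.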
 
 
 
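The other pattern worth noting is the GPU/CPU shuttling in `get_audioldm_from_caption`: the AudioLDM2 pipeline is moved onto the GPU only for the duration of a call and back to the CPU afterwards, keeping VRAM free between `@spaces.GPU` invocations. Below is a sketch of the same idea with a `try/finally` added as one possible hardening; in the commit itself, the pipeline is moved back only on the success path, so a failed generation leaves it on the GPU:

```python
import torch
from diffusers import DiffusionPipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = DiffusionPipeline.from_pretrained("cvssp/audioldm2")  # same checkpoint as the app


def generate_audio(prompt: str):
    pipe.to(device)  # move weights onto the GPU for this call only
    try:
        output = pipe(prompt=prompt, num_inference_steps=50, guidance_scale=7.5)
        return output.audios[0]
    finally:
        pipe.to("cpu")  # release GPU memory even if generation fails
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
```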