Bils committed on
Commit
63f345f
·
verified ·
1 Parent(s): a9aa30e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -85
app.py CHANGED
@@ -1,5 +1,5 @@
 
1
  import os
2
- import io
3
  import tempfile
4
  import gradio as gr
5
  from dotenv import load_dotenv
@@ -8,98 +8,54 @@ from scipy.io.wavfile import write
8
  from diffusers import DiffusionPipeline
9
  from transformers import pipeline
10
  from pathlib import Path
11
- from PIL import Image
12
-
13
- import spaces
14
 
15
  load_dotenv()
16
  hf_token = os.getenv("HF_TKN")
17
 
18
- # Determine if we have access to a GPU
19
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
  device_id = 0 if torch.cuda.is_available() else -1
21
 
22
- # Initialize the image captioning pipeline
23
  captioning_pipeline = pipeline(
24
  "image-to-text",
25
  model="nlpconnect/vit-gpt2-image-captioning",
26
  device=device_id
27
  )
28
 
29
- # Initialize the text-to-audio pipeline
30
  pipe = DiffusionPipeline.from_pretrained(
31
  "cvssp/audioldm2",
32
  use_auth_token=hf_token
33
  )
34
- pipe.to(device)
35
 
36
  @spaces.GPU(duration=120)
37
- def analyze_image_with_free_model(image_file: bytes):
38
- """
39
- Analyze the uploaded image using the ViT-GPT2 image captioning pipeline.
40
-
41
- :param image_file: Binary content of the uploaded image.
42
- :return: A tuple (caption, error_flag).
43
- caption (str) - The generated caption or error message.
44
- error_flag (bool) - Indicates if an error occurred.
45
- """
46
  try:
47
- # Validate image input
48
- if not image_file:
49
- return "Error: No image data received.", True
50
-
51
- # Check if the file is a valid image
52
- try:
53
- Image.open(io.BytesIO(image_file)).verify()
54
- except Exception:
55
- return "Error: Invalid image file. Please upload a valid image.", True
56
-
57
- # Write the valid image to a temporary file for the pipeline
58
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
59
  temp_file.write(image_file)
60
  temp_image_path = temp_file.name
61
 
62
- # Perform image captioning
63
  results = captioning_pipeline(temp_image_path)
64
  if not results or not isinstance(results, list):
65
- return "Error: Captioning pipeline returned invalid results.", True
66
 
67
- # Extract and clean up the generated caption
68
  caption = results[0].get("generated_text", "").strip()
69
  if not caption:
70
- return "No caption was generated by the model.", True
71
-
72
  return caption, False
73
 
74
  except Exception as e:
75
  return f"Error analyzing image: {e}", True
76
 
77
  @spaces.GPU(duration=120)
78
- def get_audioldm_from_caption(caption: str):
79
- """
80
- Generate an audio file (WAV) from a text caption using the AudioLDM2 pipeline.
81
-
82
- :param caption: The text prompt used to generate audio.
83
- :return: The path to the generated .wav file, or None if an error occurred.
84
- """
85
  try:
86
- # Move pipeline to GPU (if available)
87
- pipe.to(device)
88
-
89
- # Generate audio from text prompt
90
  audio_output = pipe(
91
  prompt=caption,
92
  num_inference_steps=50,
93
  guidance_scale=7.5
94
  )
95
-
96
- # Move pipeline back to CPU to free GPU memory
97
  pipe.to("cpu")
98
-
99
- # Extract the first audio sample
100
  audio = audio_output.audios[0]
101
 
102
- # Write the audio to a temporary WAV file
103
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
104
  write(temp_wav.name, 16000, audio)
105
  return temp_wav.name
@@ -108,8 +64,6 @@ def get_audioldm_from_caption(caption: str):
108
  print(f"Error generating audio from caption: {e}")
109
  return None
110
 
111
-
112
- # Custom CSS for styling the Gradio Blocks
113
  css = """
114
  #col-container{
115
  margin: 0 auto;
@@ -120,28 +74,25 @@ css = """
120
  with gr.Blocks(css=css) as demo:
121
  with gr.Column(elem_id="col-container"):
122
  gr.HTML("""
123
- <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
124
- <p style="text-align: center;">
125
- ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
126
- </p>
127
  """)
128
 
129
  gr.Markdown("""
130
- Welcome to this unique sound effect generator! This tool allows you to upload an image
131
- and generate a descriptive caption and a corresponding sound effect, all using free,
132
- open-source models on Hugging Face.
133
-
134
  **💡 How it works:**
135
  1. **Upload an image**: Choose an image that you'd like to analyze.
136
- 2. **Generate Description**: Click on 'Generate Description' to get a textual
137
- description of your uploaded image.
138
- 3. **Generate Sound Effect**: Based on the image description, click on
139
- 'Generate Sound Effect' to create a sound effect that matches the image context.
140
 
141
  Enjoy the journey from visual to auditory sensation with just a few clicks!
142
  """)
143
 
144
- # Define Gradio interface elements
145
  image_upload = gr.File(label="Upload Image", type="binary")
146
  generate_description_button = gr.Button("Generate Description")
147
  caption_display = gr.Textbox(label="Image Description", interactive=False)
@@ -162,26 +113,20 @@ with gr.Blocks(css=css) as demo:
162
 
163
  gr.Markdown("""
164
  ## 📢 Stay Connected
165
- This app is a testament to the creative possibilities that emerge when
166
- technology meets art. Enjoy exploring the auditory landscape of your images!
167
  """)
168
 
169
- # Define the helper functions for Gradio event handlers
170
  def update_caption(image_file):
171
- description, error_flag = analyze_image_with_free_model(image_file)
172
- if error_flag:
173
- # In case of error, just return the error message
174
- return description
175
  return description
176
 
177
  def generate_sound(description):
178
- # Validate the description before generating audio
179
  if not description or description.startswith("Error"):
180
  return None
181
  audio_path = get_audioldm_from_caption(description)
182
  return audio_path
183
 
184
- # Wire the Gradio events to the functions
185
  generate_description_button.click(
186
  fn=update_caption,
187
  inputs=image_upload,
@@ -193,16 +138,8 @@ with gr.Blocks(css=css) as demo:
193
  inputs=caption_display,
194
  outputs=audio_output
195
  )
196
-
197
- gr.HTML(
198
- '<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image">'
199
- '<img src="https://api.visitorbadge.io/api/visitors?path='
200
- 'https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" '
201
- '/></a>'
202
- )
203
-
204
- # An extra placeholder if needed
205
  html = gr.HTML()
206
 
207
- # Enable debug and optional share. On Spaces, 'share=True' is typically ignored.
208
- demo.launch(debug=True, share=True)
 
1
import os
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from transformers import pipeline
 
 
 
11
 
12
  load_dotenv()
13
  hf_token = os.getenv("HF_TKN")
14
 
 
 
15
  device_id = 0 if torch.cuda.is_available() else -1
16
 
 
17
  captioning_pipeline = pipeline(
18
  "image-to-text",
19
  model="nlpconnect/vit-gpt2-image-captioning",
20
  device=device_id
21
  )
22
 
 
23
  pipe = DiffusionPipeline.from_pretrained(
24
  "cvssp/audioldm2",
25
  use_auth_token=hf_token
26
  )
 
27
 
28
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Caption an uploaded image with the ViT-GPT2 image-to-text pipeline.

    :param image_file: Raw bytes of the uploaded image (gr.File type="binary").
    :return: Tuple (caption, error_flag). error_flag is True when caption
             holds an error message rather than a real caption.
    """
    temp_image_path = None
    try:
        # gr.File yields None when nothing was uploaded; fail with a clear
        # message instead of an opaque TypeError from temp_file.write(None).
        if not image_file:
            return "Error: No image data received.", True

        # The captioning pipeline expects a file path, so persist the
        # uploaded bytes to a temporary file first.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False

    except Exception as e:
        return f"Error analyzing image: {e}", True
    finally:
        # delete=False leaves the file behind; clean it up ourselves so
        # repeated calls don't accumulate temp files.
        if temp_image_path and os.path.exists(temp_image_path):
            os.remove(temp_image_path)
46
 
47
@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Generate a WAV sound effect from a text caption using AudioLDM2.

    :param caption: Text prompt describing the desired sound.
    :return: Path to a temporary .wav file, or None if generation failed.
    """
    try:
        # Only move the pipeline to the GPU when one exists; an
        # unconditional .to("cuda") raises on CPU-only hosts.
        if torch.cuda.is_available():
            pipe.to("cuda")

        audio_output = pipe(
            prompt=caption,
            num_inference_steps=50,
            guidance_scale=7.5
        )
        audio = audio_output.audios[0]

        # AudioLDM2 produces 16 kHz audio; write it out as a WAV file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
        return temp_wav.name

    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
    finally:
        # Release GPU memory even when generation raised.
        pipe.to("cpu")
66
 
 
 
67
  css = """
68
  #col-container{
69
  margin: 0 auto;
 
74
  with gr.Blocks(css=css) as demo:
75
  with gr.Column(elem_id="col-container"):
76
  gr.HTML("""
77
+ <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
78
+ <p style="text-align: center;">
79
+ ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
80
+ </p>
81
  """)
82
 
83
  gr.Markdown("""
84
+ Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
85
+ descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
86
+
 
87
  **💡 How it works:**
88
  1. **Upload an image**: Choose an image that you'd like to analyze.
89
+ 2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
90
+ 3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
91
+ sound effect that matches the image context.
 
92
 
93
  Enjoy the journey from visual to auditory sensation with just a few clicks!
94
  """)
95
 
 
96
  image_upload = gr.File(label="Upload Image", type="binary")
97
  generate_description_button = gr.Button("Generate Description")
98
  caption_display = gr.Textbox(label="Image Description", interactive=False)
 
113
 
114
  gr.Markdown("""
115
  ## 📢 Stay Connected
116
+ This app is a testament to the creative possibilities that emerge when technology meets art.
117
+ Enjoy exploring the auditory landscape of your images!
118
  """)
119
 
 
120
  def update_caption(image_file):
121
+ description, _ = analyze_image_with_free_model(image_file)
 
 
 
122
  return description
123
 
124
  def generate_sound(description):
 
125
  if not description or description.startswith("Error"):
126
  return None
127
  audio_path = get_audioldm_from_caption(description)
128
  return audio_path
129
 
 
130
  generate_description_button.click(
131
  fn=update_caption,
132
  inputs=image_upload,
 
138
  inputs=caption_display,
139
  outputs=audio_output
140
  )
141
+
142
+ gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
 
 
 
 
 
 
 
143
  html = gr.HTML()
144
 
145
+ demo.launch(debug=True, share=True)