fffiloni committed on
Commit
7125d26
1 Parent(s): f83630e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -16
app.py CHANGED
@@ -3,11 +3,27 @@ import spaces
3
  import json
4
  import re
5
  from gradio_client import Client
 
 
6
 
7
- kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
 
 
 
 
 
 
 
 
 
 
8
 
9
- def get_caption(image_in):
10
 
 
 
 
 
11
  kosmos2_result = kosmos2_client.predict(
12
  image_in, # str (filepath or URL to image) in 'Test Image' Image component
13
  "Detailed", # str in 'Description Type' Radio component
@@ -77,6 +93,22 @@ def get_magnet(prompt):
77
  )
78
  print(result)
79
  return result[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  import re
82
  import torch
@@ -112,15 +144,19 @@ def get_musical_prompt(user_prompt):
112
  print(f"SUGGESTED Musical prompt: {cleaned_text}")
113
  return cleaned_text.lstrip("\n")
114
 
115
- def infer(image_in):
116
  gr.Info("Getting image caption with Kosmos2...")
117
  user_prompt = get_caption(image_in)
118
 
119
  gr.Info("Building a musical prompt according to the image caption ...")
120
  musical_prompt = get_musical_prompt(user_prompt)
121
 
122
- gr.Info("Now calling MAGNet for music ...")
123
- music_o = get_magnet(musical_prompt)
 
 
 
 
124
 
125
  return musical_prompt, music_o
126
 
@@ -149,10 +185,18 @@ with gr.Blocks(css=css) as demo:
149
  type = "filepath",
150
  elem_id = "image-in"
151
  )
 
 
 
 
 
 
 
 
152
  submit_btn = gr.Button("Make music from my pic !")
153
  with gr.Column():
154
  caption = gr.Textbox(
155
- label = "Musical prompt",
156
  max_lines = 3
157
  )
158
  result = gr.Audio(
@@ -161,16 +205,16 @@ with gr.Blocks(css=css) as demo:
161
  with gr.Column():
162
  gr.Examples(
163
  examples = [
164
- ["examples/monalisa.png"],
165
- ["examples/santa.png"],
166
- ["examples/ocean_poet.jpeg"],
167
- ["examples/winter_hiking.png"],
168
- ["examples/teatime.jpeg"],
169
- ["examples/news_experts.jpeg"],
170
- ["examples/chicken_adobo.jpeg"]
171
  ],
172
  fn = infer,
173
- inputs = [image_in],
174
  outputs = [caption, result],
175
  cache_examples = False
176
  )
@@ -178,7 +222,8 @@ with gr.Blocks(css=css) as demo:
178
  submit_btn.click(
179
  fn = infer,
180
  inputs = [
181
- image_in
 
182
  ],
183
  outputs =[
184
  caption,
@@ -186,4 +231,4 @@ with gr.Blocks(css=css) as demo:
186
  ]
187
  )
188
 
189
- demo.queue().launch(show_api=False)
 
3
  import json
4
  import re
5
  from gradio_client import Client
6
+ from moviepy.editor import VideoFileClip
7
+ from moviepy.audio.AudioClip import AudioClip
8
 
9
+ def extract_audio(video_in):
10
+ input_video = video_in
11
+ output_audio = 'audio.wav'
12
+
13
+ # Open the video file and extract the audio
14
+ video_clip = VideoFileClip(input_video)
15
+ audio_clip = video_clip.audio
16
+
17
+ # Save the audio as a .wav file
18
+ audio_clip.write_audiofile(output_audio, fps=44100) # Use 44100 Hz as the sample rate for .wav files
19
+ print("Audio extraction complete.")
20
 
21
+ return 'audio.wav'
22
 
23
+
24
+
25
+ def get_caption(image_in):
26
+ kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
27
  kosmos2_result = kosmos2_client.predict(
28
  image_in, # str (filepath or URL to image) in 'Test Image' Image component
29
  "Detailed", # str in 'Description Type' Radio component
 
93
  )
94
  print(result)
95
  return result[1]
96
+
97
def get_audioldm(prompt):
    """Generate music for *prompt* via the remote AudioLDM-2 Space.

    Calls the hosted text-to-music endpoint, then extracts and returns
    the path to a local .wav file built from the endpoint's result.
    """
    audioldm_client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    generation = audioldm_client.predict(
        prompt,          # str in 'Input text' Textbox component
        "Low quality.",  # str in 'Negative prompt' Textbox component
        10,              # duration in seconds (5-15) Slider component
        3.5,             # guidance scale (0-7) Slider component
        45,              # seed, Number component
        3,               # number of waveforms to generate (1-5) Slider component
        fn_index=1,
    )
    print(generation)
    return extract_audio(generation)
111
+
112
 
113
  import re
114
  import torch
 
144
  print(f"SUGGESTED Musical prompt: {cleaned_text}")
145
  return cleaned_text.lstrip("\n")
146
 
147
def infer(image_in, chosen_model):
    """Generate a music clip from an input image.

    Pipeline: caption the image with Kosmos-2, turn the caption into a
    musical prompt, then synthesize audio with the user-chosen model.

    Parameters
    ----------
    image_in : str
        Filepath of the input image.
    chosen_model : str
        Either "MAGNet" or "AudioLDM-2".

    Returns
    -------
    tuple[str, str]
        (musical prompt text, path to the generated audio file).
    """
    gr.Info("Getting image caption with Kosmos2...")
    user_prompt = get_caption(image_in)

    gr.Info("Building a musical prompt according to the image caption ...")
    musical_prompt = get_musical_prompt(user_prompt)

    if chosen_model == "MAGNet":
        gr.Info("Now calling MAGNet for music...")
        music_o = get_magnet(musical_prompt)
    elif chosen_model == "AudioLDM-2":
        gr.Info("Now calling AudioLDM-2 for music...")
        # BUG FIX: this branch previously called get_magnet(), so the
        # AudioLDM-2 choice was silently ignored.
        music_o = get_audioldm(musical_prompt)
    else:
        # Previously an unexpected value left music_o unbound
        # (UnboundLocalError); fail with an explicit message instead.
        raise gr.Error(f"Unknown model choice: {chosen_model}")

    return musical_prompt, music_o
162
 
 
185
  type = "filepath",
186
  elem_id = "image-in"
187
  )
188
+ chosen_model = gr.Radio(
189
+ label = "Choose a model",
190
+ choices = [
191
+ "MAGNet",
192
+ "AudioLDM-2"
193
+ ],
194
+ value = "MAGNet"
195
+ )
196
  submit_btn = gr.Button("Make music from my pic !")
197
  with gr.Column():
198
  caption = gr.Textbox(
199
+ label = "Inspirational musical prompt",
200
  max_lines = 3
201
  )
202
  result = gr.Audio(
 
205
  with gr.Column():
206
  gr.Examples(
207
  examples = [
208
+ ["examples/monalisa.png", "MAGNet"],
209
+ ["examples/santa.png", "MAGNet"],
210
+ ["examples/ocean_poet.jpeg", "MAGNet"],
211
+ ["examples/winter_hiking.png", "MAGNet"],
212
+ ["examples/teatime.jpeg", "MAGNet"],
213
+ ["examples/news_experts.jpeg", "MAGNet"],
214
+ ["examples/chicken_adobo.jpeg", "MAGNet"]
215
  ],
216
  fn = infer,
217
+ inputs = [image_in, chosen_model],
218
  outputs = [caption, result],
219
  cache_examples = False
220
  )
 
222
  submit_btn.click(
223
  fn = infer,
224
  inputs = [
225
+ image_in,
226
+ chosen_model
227
  ],
228
  outputs =[
229
  caption,
 
231
  ]
232
  )
233
 
234
+ demo.queue(max_size=16).launch(show_api=False)