js282979 committed on
Commit
75c97f0
·
verified ·
1 Parent(s): ee066bd

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,39 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/chinese-new-year-dragon.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ examples/photoreal-train.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ examples/train-window.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ examples/big-sur.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: Video2sfx
3
- emoji: 👀
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 4.36.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Video SoundFX
3
+ emoji: 👂🎞️
4
+ colorFrom: blue
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 4.36.0
8
+ app_file: app.py
9
+ pinned: true
10
+ short_description: Generates a sound effect that matches video shot
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_client import Client
3
+ import os
4
+ import json
5
+ import re
6
+ from moviepy.editor import *
7
+ import cv2
8
+
9
+ hf_token = os.environ.get("HF_TKN")
10
+
11
+ def extract_firstframe(video_in):
12
+ vidcap = cv2.VideoCapture(video_in)
13
+ success,image = vidcap.read()
14
+ count = 0
15
+ while success:
16
+ if count == 0:
17
+ cv2.imwrite("first_frame.jpg", image) # save first extracted frame as jpg file named first_frame.jpg
18
+ else:
19
+ break # exit loop after saving first frame
20
+ success,image = vidcap.read()
21
+ print ('Read a new frame: ', success)
22
+ count += 1
23
+ print ("Done extracted first frame!")
24
+ return "first_frame.jpg"
25
+
26
+ def extract_audio(video_in):
27
+ input_video = video_in
28
+ output_audio = 'audio.wav'
29
+
30
+ # Open the video file and extract the audio
31
+ video_clip = VideoFileClip(input_video)
32
+ audio_clip = video_clip.audio
33
+
34
+ # Save the audio as a .wav file
35
+ audio_clip.write_audiofile(output_audio, fps=44100) # Use 44100 Hz as the sample rate for .wav files
36
+ print("Audio extraction complete.")
37
+
38
+ return 'audio.wav'
39
+
40
+ def get_caption_from_kosmos(image_in):
41
+ kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
42
+
43
+ kosmos2_result = kosmos2_client.predict(
44
+ image_in, # str (filepath or URL to image) in 'Test Image' Image component
45
+ "Detailed", # str in 'Description Type' Radio component
46
+ fn_index=4
47
+ )
48
+
49
+ print(f"KOSMOS2 RETURNS: {kosmos2_result}")
50
+
51
+ with open(kosmos2_result[1], 'r') as f:
52
+ data = json.load(f)
53
+
54
+ reconstructed_sentence = []
55
+ for sublist in data:
56
+ reconstructed_sentence.append(sublist[0])
57
+
58
+ full_sentence = ' '.join(reconstructed_sentence)
59
+ #print(full_sentence)
60
+
61
+ # Find the pattern matching the expected format ("Describe this image in detail:" followed by optional space and then the rest)...
62
+ pattern = r'^Describe this image in detail:\s*(.*)$'
63
+ # Apply the regex pattern to extract the description text.
64
+ match = re.search(pattern, full_sentence)
65
+ if match:
66
+ description = match.group(1)
67
+ print(description)
68
+ else:
69
+ print("Unable to locate valid description.")
70
+
71
+ # Find the last occurrence of "."
72
+ last_period_index = description.rfind('.')
73
+
74
+ # Truncate the string up to the last period
75
+ truncated_caption = description[:last_period_index + 1]
76
+
77
+ # print(truncated_caption)
78
+ print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
79
+
80
+ return truncated_caption
81
+
82
def get_caption(image_in):
    """Return a one-sentence caption of *image_in* via the moondream1 Space.

    Parameters
    ----------
    image_in : str
        Filepath of the image to caption.

    Returns
    -------
    The Space's prediction for the fixed captioning question.
    """
    question = "Describe precisely the image in one sentence."
    moondream_client = Client(
        "https://fffiloni-moondream1.hf.space/", hf_token=hf_token
    )
    caption = moondream_client.predict(
        image_in,   # filepath in 'image' Image component
        question,   # str in 'Question' Textbox component
        api_name="/predict"
    )
    print(caption)
    return caption
92
+
93
def get_magnet(prompt):
    """Generate audio for *prompt* with the MAGNet Space.

    Parameters
    ----------
    prompt : str
        Text prompt describing the desired sound.

    Returns
    -------
    The generated audio (second element of the Space's result tuple).
    """
    # Fix: dropped the no-op `amended_prompt = f"{prompt}"` wrapper —
    # it only re-created the same string.
    print(prompt)
    client = Client("https://fffiloni-magnet.hf.space/")
    result = client.predict(
        "facebook/audio-magnet-medium",  # Literal[...] in 'Model' Radio component
        "",          # str in 'Model Path (custom models)' Textbox component
        prompt,      # str in 'Input Text' Textbox component
        3,           # float in 'Temperature' Number component
        0.9,         # float in 'Top-p' Number component
        10,          # float in 'Max CFG coefficient' Number component
        1,           # float in 'Min CFG coefficient' Number component
        20,          # float in 'Decoding Steps (stage 1)' Number component
        10,          # float in 'Decoding Steps (stage 2)' Number component
        10,          # float in 'Decoding Steps (stage 3)' Number component
        10,          # float in 'Decoding Steps (stage 4)' Number component
        "prod-stride1 (new!)",  # Literal[...] in 'Span Scoring' Radio component
        api_name="/predict_full"
    )
    print(result)
    # result[0] is diagnostic output; the audio itself is at index 1.
    return result[1]
114
+
115
def get_audioldm(prompt):
    """Generate ~10 s of audio for *prompt* with AudioLDM-2 and return a wav path.

    Parameters
    ----------
    prompt : str
        Text prompt describing the desired sound.

    Returns
    -------
    str
        Path of the extracted wav file ("audio.wav").
    """
    audioldm_client = Client(
        "https://haoheliu-audioldm2-text2audio-text2music.hf.space/"
    )
    generated = audioldm_client.predict(
        prompt,                # str in 'Input text' Textbox component
        "Low quality. Music.", # str in 'Negative prompt' Textbox component
        10,    # duration in seconds (5-15)
        3.5,   # guidance scale (0-7)
        45,    # seed
        3,     # number of waveforms to generate (1-5)
        fn_index=1
    )
    print(generated)
    # The Space's result is fed through extract_audio, which treats it as a
    # media container and pulls out its audio track as a wav file.
    return extract_audio(generated)
129
+
130
def get_audiogen(prompt):
    """Generate 10 s of audio for *prompt* with the AudioGen Space.

    Parameters
    ----------
    prompt : str
        Text prompt describing the desired sound.

    Returns
    -------
    The Space's generated audio result.
    """
    audiogen_client = Client("https://fffiloni-audiogen.hf.space/")
    duration_seconds = 10
    generated = audiogen_client.predict(
        prompt,
        duration_seconds,
        api_name="/infer"
    )
    return generated
138
+
139
def get_tango(prompt):
    """Generate audio for *prompt* with the Tango Space.

    Parameters
    ----------
    prompt : str
        Text prompt describing the desired sound.

    Returns
    -------
    The Space's generated audio result.

    Raises
    ------
    gr.Error
        If the Tango Space cannot be reached.
    """
    try:
        #client = Client("https://declare-lab-tango.hf.space/")
        client = Client("https://fffiloni-tango.hf.space/", hf_token=hf_token)
    except Exception as e:
        # Fix: was a bare `except:` which also swallowed KeyboardInterrupt /
        # SystemExit; narrow it and chain the cause for debuggability.
        raise gr.Error("Tango space API is not ready, please try again in few minutes ") from e

    result = client.predict(
        prompt,  # str representing string value in 'Prompt' Textbox component
        100,     # int | float between 100 and 200 in 'Steps' Slider component
        4,       # int | float between 1 and 10 in 'Guidance Scale' Slider component
        api_name="/predict"
    )
    print(result)
    return result
154
+
155
def blend_vsfx(video_in, audio_result):
    """Mux *audio_result* onto *video_in*, trimming the longer stream.

    Parameters
    ----------
    video_in : str
        Path to the source video.
    audio_result : str
        Path to the generated audio file.

    Returns
    -------
    str
        Path of the muxed video, always "final_video_with_sound.mp4".
    """
    audio_clip = AudioFileClip(audio_result)
    print(f"AUD: {audio_clip.duration}")
    video_clip = VideoFileClip(video_in)
    print(f"VID: {video_clip.duration}")
    try:
        # Cut whichever stream is longer so durations match exactly.
        if video_clip.duration < audio_clip.duration:
            audio_clip = audio_clip.subclip(0.0, video_clip.duration)
        elif video_clip.duration > audio_clip.duration:
            video_clip = video_clip.subclip(0.0, audio_clip.duration)
        final_clip = video_clip.set_audio(audio_clip)
        # H.264 + AAC for broad browser playback support.
        final_clip.write_videofile(
            'final_video_with_sound.mp4', codec='libx264', audio_codec='aac'
        )
    finally:
        # Fix: close both clips so ffmpeg readers are released
        # (the original leaked them on every call).
        audio_clip.close()
        video_clip.close()
    return "final_video_with_sound.mp4"
170
+
171
def infer(video_in, chosen_model):
    """Full pipeline: caption the video's first frame, generate matching
    audio with the chosen model, and mux it back onto the video.

    Parameters
    ----------
    video_in : str
        Path of the uploaded video.
    chosen_model : str
        One of "MAGNet", "AudioLDM-2", "AudioGen", "Tango".

    Returns
    -------
    tuple
        (caption textbox update, retry-button update, audio path, final video path).

    Raises
    ------
    gr.Error
        If *chosen_model* is not a known model name.
    """
    image_in = extract_firstframe(video_in)
    caption = get_caption(image_in)
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "AudioGen": get_audiogen,
        "Tango": get_tango,
    }
    try:
        generate_audio = generators[chosen_model]
    except KeyError:
        # Fix: the original if/elif chain fell through for unknown names,
        # leaving `audio_result` undefined and raising a NameError below.
        raise gr.Error(f"Unknown model: {chosen_model}")
    audio_result = generate_audio(caption)

    final_res = blend_vsfx(video_in, audio_result)
    # Unlock the caption textbox and retry button now that a caption exists.
    return gr.update(value=caption, interactive=True), gr.update(interactive=True), audio_result, final_res
185
+
186
def retry(edited_prompt, video_in, chosen_model):
    """Re-run audio generation with a user-edited caption and re-mux.

    Parameters
    ----------
    edited_prompt : str
        Caption text as edited by the user in the textbox.
    video_in : str
        Path of the uploaded video.
    chosen_model : str
        One of "MAGNet", "AudioLDM-2", "AudioGen", "Tango".

    Returns
    -------
    tuple
        (audio path, final video path).

    Raises
    ------
    gr.Error
        If *chosen_model* is not a known model name.
    """
    # Fix: the original called extract_firstframe(video_in) here and
    # discarded the result — the caption comes straight from the user's
    # edit, so the frame decode was wasted work.
    caption = edited_prompt
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "AudioGen": get_audiogen,
        "Tango": get_tango,
    }
    try:
        generate_audio = generators[chosen_model]
    except KeyError:
        # Fix: the original fell through with `audio_result` undefined
        # (NameError) for an unknown model name.
        raise gr.Error(f"Unknown model: {chosen_model}")
    audio_result = generate_audio(caption)

    final_res = blend_vsfx(video_in, audio_result)
    return audio_result, final_res
200
+
201
def refresh():
    """Reset the UI when the input video changes or is cleared: empty the
    caption, disable the retry button, and clear audio/video outputs."""
    caption_reset = gr.update(value=None, interactive=False)
    retry_reset = gr.update(interactive=False)
    audio_reset = gr.update(value=None)
    video_reset = gr.update(value=None)
    return caption_reset, retry_reset, audio_reset, video_reset
203
+
204
# Center the main column and cap its width.
css="""
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Page title and one-line description.
        gr.HTML("""
        <h2 style="text-align: center;">
            Video to SoundFX
        </h2>
        <p style="text-align: center;">
            Get sound effects from a video shot while comparing audio models from image caption.
        </p>
        """)

        with gr.Row():

            # Left column: input video + model choice + submit.
            with gr.Column():
                video_in = gr.Video(sources=["upload"], label="Video input")
                with gr.Row():
                    chosen_model = gr.Dropdown(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen", "Tango"], value="Tango")
                    submit_btn = gr.Button("Submit", scale=0)
            # Middle column: editable caption, retry, generated audio.
            with gr.Column():
                caption_o = gr.Textbox(label="Scene caption", interactive=False)
                retry_btn = gr.Button("Retry with edited scene caption", interactive=False)
                audio_o = gr.Audio(label="Audio output")
            # Right column: final muxed video.
            with gr.Column():
                video_o = gr.Video(label="Video with soundFX")

        # Sample videos; not cached because generation is remote and slow.
        gr.Examples(
            examples = [
                ["examples/photoreal-train.mp4", "Tango"],
                ["examples/train-window.mp4", "Tango"],
                ["examples/chinese-new-year-dragon.mp4", "Tango"],
                ["examples/big-sur.mp4", "AudioLDM-2"]
            ],
            fn=infer,
            inputs = [video_in, chosen_model],
            outputs= [caption_o, retry_btn, audio_o, video_o],
            cache_examples=False
        )

        # Reset all outputs whenever the input video changes...
        video_in.change(
            fn = refresh,
            inputs = None,
            outputs = [caption_o, retry_btn, audio_o, video_o],
            queue = False,
            show_progress = False
        )

        # ...or is cleared.
        video_in.clear(
            fn = refresh,
            inputs = None,
            outputs = [caption_o, retry_btn, audio_o, video_o],
            queue = False,
            show_progress = False
        )

        # Main pipeline trigger; limit concurrent runs to protect the
        # upstream Spaces.
        submit_btn.click(
            fn=infer,
            inputs=[video_in, chosen_model],
            outputs=[caption_o, retry_btn, audio_o, video_o],
            concurrency_limit = 2
        )

        # Regenerate audio from the user-edited caption.
        retry_btn.click(
            fn=retry,
            inputs=[caption_o, video_in, chosen_model],
            outputs=[audio_o, video_o],
            concurrency_limit = 2
        )

demo.queue(max_size=10).launch(show_api=False, debug=True, show_error=True)
examples/big-sur.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a088ededf76628fc89e5c738e97ecbc50dc78832fffe3e675e5a937331d1e20e
3
+ size 17049093
examples/blank.md ADDED
File without changes
examples/chinese-new-year-dragon.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d686ae1b00e6deddbaa637c37d36c21571f9f0e6f6f38cd452ed3470e491a56
3
+ size 15220841
examples/photoreal-train.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a468dbb3d950fcc280b03223169d5c4b76425c9a226fa42b73a055694e156434
3
+ size 5889386
examples/train-window.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82addfe05e21760e1f76efae4345d7d4302d1a8b873dc746f885e139dd789ccb
3
+ size 18462003
oiseau.png ADDED
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ moviepy
2
+ opencv-python