import gradio as gr
from gradio_client import Client
import os
import json
import re
from moviepy.editor import VideoFileClip, AudioFileClip
import cv2

hf_token = os.environ.get("HF_TKN")
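# NOTE: HF_TKN must be set in the environment; it authenticates the gradio_client
# calls to the moondream1 and Tango Spaces below. All models are invoked as remote
# Spaces, so they may be updated or rate-limited independently of this app.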
def extract_firstframe(video_in):
    """Save the first frame of the input video as 'first_frame.jpg' and return its path."""
    vidcap = cv2.VideoCapture(video_in)
    success, image = vidcap.read()
    vidcap.release()
    if not success:
        raise gr.Error("Could not read a frame from the input video.")
    cv2.imwrite("first_frame.jpg", image)
    print("Done extracting first frame!")
    return "first_frame.jpg"
def extract_audio(video_in):
    """Extract the audio track of a video and save it as 'audio.wav'."""
    output_audio = 'audio.wav'

    video_clip = VideoFileClip(video_in)
    audio_clip = video_clip.audio
    if audio_clip is None:
        raise gr.Error("The input video has no audio track.")

    audio_clip.write_audiofile(output_audio, fps=44100)
    video_clip.close()
    print("Audio extraction complete.")

    return output_audio
def get_caption_from_kosmos(image_in):
    """Caption an image with the KOSMOS-2 Space and return the description text."""
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

    kosmos2_result = kosmos2_client.predict(
        image_in,
        "Detailed",
        fn_index=4
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result is a JSON file of token entries;
    # rebuild the sentence from the first field of each entry.
    with open(kosmos2_result[1], 'r') as f:
        data = json.load(f)

    reconstructed_sentence = [sublist[0] for sublist in data]
    full_sentence = ' '.join(reconstructed_sentence)

    # Strip the instruction prefix that KOSMOS-2 echoes back.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        # Fail loudly instead of continuing with an undefined description.
        raise gr.Error("Unable to locate a valid description.")

    # Truncate at the last complete sentence.
    last_period_index = description.rfind('.')
    truncated_caption = description[:last_period_index + 1]

    print(f"IMAGE CAPTION: {truncated_caption}")

    return truncated_caption
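# NOTE: infer() below uses get_caption() (moondream1); get_caption_from_kosmos()
# is kept as an alternative captioner.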
def get_caption(image_in):
    """Caption an image in a single sentence with the moondream1 Space."""
    client = Client("https://fffiloni-moondream1.hf.space/", hf_token=hf_token)
    result = client.predict(
        image_in,
        "Describe precisely the image in one sentence.",
        api_name="/predict"
    )
    print(result)
    return result
def get_magnet(prompt):
    """Generate sound from text with the MAGNeT Space; returns the audio file path."""
    print(prompt)
    client = Client("https://fffiloni-magnet.hf.space/")
    # Positional arguments follow the Space's /predict_full signature:
    # "facebook/audio-magnet-medium" selects the model, `prompt` is the text
    # prompt, and the remaining values are decoding parameters.
    result = client.predict(
        "facebook/audio-magnet-medium",
        "",
        prompt,
        3,
        0.9,
        10,
        1,
        20,
        10,
        10,
        10,
        "prod-stride1 (new!)",
        api_name="/predict_full"
    )
    print(result)
    # The Space returns several outputs; index 1 is the generated audio file.
    return result[1]
def get_audioldm(prompt):
    """Generate sound from text with the AudioLDM-2 Space; returns the audio file path."""
    client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    result = client.predict(
        prompt,
        "Low quality. Music.",  # presumably the negative prompt
        10,
        3.5,
        45,
        3,
        fn_index=1
    )
    print(result)
    # The Space returns a video file; extract its audio track.
    audio_result = extract_audio(result)
    return audio_result
def get_audiogen(prompt):
    """Generate sound from text with the AudioGen Space; returns the audio file path."""
    client = Client("https://fffiloni-audiogen.hf.space/")
    result = client.predict(
        prompt,
        10,  # presumably the duration in seconds
        api_name="/infer"
    )
    return result
def get_tango(prompt):
    """Generate sound from text with the Tango Space; returns the audio file path."""
    try:
        client = Client("https://fffiloni-tango.hf.space/", hf_token=hf_token)
    except Exception:
        raise gr.Error("Tango Space API is not ready, please try again in a few minutes.")

    result = client.predict(
        prompt,
        100,
        4,
        api_name="/predict"
    )
    print(result)
    return result
def blend_vsfx(video_in, audio_result):
    """Mux the generated audio onto the input video, trimming both to the shorter duration."""
    audio_clip = AudioFileClip(audio_result)
    print(f"AUD: {audio_clip.duration}")
    clip = VideoFileClip(video_in)
    print(f"VID: {clip.duration}")
    if clip.duration < audio_clip.duration:
        audio_clip = audio_clip.subclip(0.0, clip.duration)
    elif clip.duration > audio_clip.duration:
        clip = clip.subclip(0.0, audio_clip.duration)
    final_clip = clip.set_audio(audio_clip)

    final_clip.write_videofile('final_video_with_sound.mp4', codec='libx264', audio_codec='aac')
    return "final_video_with_sound.mp4"
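# Pipeline: extract the first frame, caption it, synthesize audio with the chosen
# model, then mux the audio back onto the input video.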
def infer(video_in, chosen_model):
    """Caption the first frame, generate audio with the chosen model, and blend it in."""
    image_in = extract_firstframe(video_in)
    caption = get_caption(image_in)
    if chosen_model == "MAGNet":
        audio_result = get_magnet(caption)
    elif chosen_model == "AudioLDM-2":
        audio_result = get_audioldm(caption)
    elif chosen_model == "AudioGen":
        audio_result = get_audiogen(caption)
    elif chosen_model == "Tango":
        audio_result = get_tango(caption)
    else:
        raise gr.Error(f"Unknown model: {chosen_model}")

    final_res = blend_vsfx(video_in, audio_result)
    return gr.update(value=caption, interactive=True), gr.update(interactive=True), audio_result, final_res
def retry(edited_prompt, video_in, chosen_model):
    """Regenerate audio from the user-edited caption, skipping re-captioning."""
    caption = edited_prompt
    if chosen_model == "MAGNet":
        audio_result = get_magnet(caption)
    elif chosen_model == "AudioLDM-2":
        audio_result = get_audioldm(caption)
    elif chosen_model == "AudioGen":
        audio_result = get_audiogen(caption)
    elif chosen_model == "Tango":
        audio_result = get_tango(caption)
    else:
        raise gr.Error(f"Unknown model: {chosen_model}")

    final_res = blend_vsfx(video_in, audio_result)
    return audio_result, final_res
def refresh():
    """Reset the caption, retry button, and outputs when the input video changes."""
    return gr.update(value=None, interactive=False), gr.update(interactive=False), gr.update(value=None), gr.update(value=None)
css="""
|
|
#col-container{
|
|
margin: 0 auto;
|
|
max-width: 800px;
|
|
}
|
|
"""
|
|
|
|
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">
            Video to SoundFX
        </h2>
        <p style="text-align: center;">
            Generate sound effects for a video shot, and compare text-to-audio models driven by an image caption.
        </p>
        """)
        with gr.Row():
            with gr.Column():
                video_in = gr.Video(sources=["upload"], label="Video input")
                with gr.Row():
                    chosen_model = gr.Dropdown(
                        label="Choose a model",
                        choices=["MAGNet", "AudioLDM-2", "AudioGen", "Tango"],
                        value="Tango"
                    )
                    submit_btn = gr.Button("Submit", scale=0)
            with gr.Column():
                caption_o = gr.Textbox(label="Scene caption", interactive=False)
                retry_btn = gr.Button("Retry with edited scene caption", interactive=False)
                audio_o = gr.Audio(label="Audio output")
            with gr.Column():
                video_o = gr.Video(label="Video with soundFX")
        gr.Examples(
            examples=[
                ["examples/photoreal-train.mp4", "Tango"],
                ["examples/train-window.mp4", "Tango"],
                ["examples/chinese-new-year-dragon.mp4", "Tango"],
                ["examples/big-sur.mp4", "AudioLDM-2"]
            ],
            fn=infer,
            inputs=[video_in, chosen_model],
            outputs=[caption_o, retry_btn, audio_o, video_o],
            cache_examples=False
        )
    video_in.change(
        fn=refresh,
        inputs=None,
        outputs=[caption_o, retry_btn, audio_o, video_o],
        queue=False,
        show_progress=False
    )

    video_in.clear(
        fn=refresh,
        inputs=None,
        outputs=[caption_o, retry_btn, audio_o, video_o],
        queue=False,
        show_progress=False
    )
    submit_btn.click(
        fn=infer,
        inputs=[video_in, chosen_model],
        outputs=[caption_o, retry_btn, audio_o, video_o],
        concurrency_limit=2
    )

    retry_btn.click(
        fn=retry,
        inputs=[caption_o, video_in, chosen_model],
        outputs=[audio_o, video_o],
        concurrency_limit=2
    )
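# When hosted as a Hugging Face Space this module is executed directly;
# locally, running the file launches the Gradio server (with HF_TKN set).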
demo.queue(max_size=10).launch(show_api=False, debug=True, show_error=True)