import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image
import scipy.io.wavfile  # "import scipy" alone does not expose scipy.io.wavfile
import time


def video_to_descriptions(video, target_language="en"):
    start_time = time.time()
    print("START TIME = ", start_time)
    ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")
    translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
    audio = pipeline("text-to-speech", model="suno/bark-small")

    # Bark voice preset for the target language. Note: it is never passed to the
    # text-to-speech pipeline below, so Bark falls back to its default voice.
    voice_preset = f"v2/{target_language}_speaker_1"
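
    # Read the uploaded video; fps determines the frame-sampling interval below.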
    cap = cv2.VideoCapture(video)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    descriptions = []
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Caption one frame every two seconds of video.
        if frame_count % (fps * 2) == 0:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_img = Image.fromarray(frame_rgb)
            outputs = ImgToText(pil_img)
            description = outputs[0]["generated_text"]
            descriptions.append(description)
            print(str(frame_count) + " : " + description)
        frame_count += 1
    cap.release()
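
    # Collapse the per-frame captions into a single short summary, then translate it.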
    concatenated_description = " ".join(descriptions)
    summarized_description = Summarize(concatenated_description, max_length=31)[0]["summary_text"]
    print("SUMMARIZATION : " + summarized_description)

    translated_text = translator(summarized_description)[0]["translation_text"]
    print("TRANSLATION : " + translated_text)
    audio_file = audio(translated_text)
    output_path = "./bark_out.wav"
    scipy.io.wavfile.write(output_path, rate=audio_file["sampling_rate"], data=audio_file["audio"][0])

    stop_time = time.time()
    print("EXECUTION TIME = ", stop_time - start_time)
    return output_path
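

# Gradio UI: a video upload plus a language dropdown, returning the generated audio.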
language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], label="[MANDATORY] Language", info="The Voice's Language"
)

iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Video to Upload"), language_dropdown],
    outputs="audio",
    live=False,
)

if __name__ == "__main__":
    iface.launch()