import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image


def video_to_descriptions(video, target_language="en"): |
|
|
|
ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large") |
|
Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer") |
|
|
|
|
|
translator = translation_pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}") |
|
audio = pipeline("text-to-speech", model="suno/bark") |
|
|
|
|
|
cap = cv2.VideoCapture(video) |
|
fps = int(cap.get(cv2.CAP_PROP_FPS)) |
|
|
|
descriptions = [] |
|
frame_count = 0 |
|
|
|
while True: |
|
ret, frame = cap.read() |
|
if not ret: |
|
break |
|
|
|
|
|
if frame_count % (fps * 2) == 0: |
|
|
|
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) |
|
|
|
pil_img = Image.fromarray(frame_rgb) |
|
|
|
outputs = ImgToText(pil_img) |
|
description = outputs[0]['generated_text'] |
|
descriptions.append(description) |
|
|
|
frame_count += 1 |
|
|
|
|
|
cap.release() |
|
|
|
|
|
concatenated_descriptions = " ".join(descriptions) |
|
concatenated_descriptions = Summarize(concatenated_descriptions, max_length=(len(concatenated_descriptions) / 3)) |
|
|
|
|
|
translated_text = translator(concatenated_descriptions[0]["summarized-text"])[0]["translation_text"] |
|
|
|
audio_file = audio(translated_text)[0]["audio"] |
|
|
|
return audio_file |
|
|
|
|
|
language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], value="en", label="Language", info="The language of the output"
)

iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Import a Video"), language_dropdown],
    outputs="audio",
    live=False,
)

if __name__ == "__main__":
    iface.launch()