import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image


def video_to_descriptions(video, target_language="en"):
    # Caption sampled frames, summarize the captions, translate the summary,
    # and synthesize speech for the translated text.
    img_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")

    # "Helsinki-NLP/opus-mt-en-en" is not an available model, so only build a
    # translator when the target language is not already English.
    translator = None
    if target_language != "en":
        translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
    audio = pipeline("text-to-speech", model="suno/bark")

    cap = cv2.VideoCapture(video)
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    descriptions = []
    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Caption one frame every two seconds of video.
        if frame_count % (fps * 2) == 0:
            # OpenCV decodes frames as BGR; BLIP expects an RGB PIL image.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_img = Image.fromarray(frame_rgb)

            outputs = img_to_text(pil_img)
            description = outputs[0]["generated_text"]
            descriptions.append(description)
            print(f"{frame_count} : {description}")

        frame_count += 1

    cap.release()

    concatenated_description = " ".join(descriptions)
    summarized_description = summarize(concatenated_description, max_length=31)[0]["summary_text"]
    print("SUMMARIZATION : " + summarized_description)

    translated_text = summarized_description
    if translator is not None:
        translated_text = translator(summarized_description)[0]["translation_text"]
    print("TRANSLATION : " + translated_text)

    # The synthesized speech is only printed; it is not surfaced in the interface output.
    print(audio(translated_text))

    return translated_text


language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], label="Language", info="The language of the output"
)

iface = gr.Interface(
    fn=video_to_descriptions,
    # `info` is not a documented gr.Video parameter, so only a label is set here.
    inputs=[gr.Video(label="Import a Video"), language_dropdown],
    outputs="text",
    live=False,
)

if __name__ == "__main__":
    iface.launch()