import gradio as gr
import cv2
from PIL import Image
# The transformers pipeline factory covers the captioning, summarization,
# translation, and text-to-speech models used below
from transformers import pipeline
def video_to_descriptions(video, target_language="en"):
    # Load the image-to-text, summarization, and text-to-speech pipelines
    ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")
    audio = pipeline("text-to-speech", model="suno/bark")
    # Load a translation pipeline only when the target is not English:
    # there is no Helsinki-NLP/opus-mt-en-en checkpoint
    translator = None
    if target_language != "en":
        translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
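    # Note: building four pipelines on every call is slow; a common
    # alternative (not shown here) is to construct them once at module
    # scope and reuse them across requests.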
    # Open the video
    cap = cv2.VideoCapture(video)
    # Fall back to 30 fps if OpenCV cannot determine the frame rate,
    # which would otherwise make the modulo check below divide by zero
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30
    descriptions = []
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Extract an image every 2 seconds
        if frame_count % (fps * 2) == 0:
            # Convert the image to RGB (OpenCV decodes frames as BGR)
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Convert the numpy array to a PIL image
            pil_img = Image.fromarray(frame_rgb)
            # Get the image description
            outputs = ImgToText(pil_img)
            description = outputs[0]["generated_text"]
            descriptions.append(description)
        frame_count += 1
    # Close the video reader
    cap.release()
    # Concatenate the descriptions and summarize them
    concatenated_descriptions = " ".join(descriptions)
    # max_length must be an integer; a third of the character count is a rough cap
    summary = Summarize(concatenated_descriptions, max_length=len(concatenated_descriptions) // 3)
    summary_text = summary[0]["summary_text"]
    # Translate the summarized text into the target language if needed
    if translator is not None:
        summary_text = translator(summary_text)[0]["translation_text"]
    # Synthesize speech; the pipeline returns the waveform and its sampling rate
    speech = audio(summary_text)
    # Gradio's audio output accepts a (sampling_rate, waveform) tuple;
    # squeeze in case the model returns a leading batch dimension
    return speech["sampling_rate"], speech["audio"].squeeze()
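
# Quick smoke test without the UI (a sketch; "sample.mp4" is a placeholder
# path, and scipy is used only to write the waveform to disk):
#
#   sampling_rate, waveform = video_to_descriptions("sample.mp4", "fr")
#   from scipy.io import wavfile
#   wavfile.write("out.wav", sampling_rate, waveform)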
# Create a dropdown menu with language options
language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], label="Language", info="The language of the output audio"
)
# gr.Video does not take an `info` argument, so the hint lives in the label
iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Import a Video to be described"), language_dropdown],
    outputs="audio",
    live=False,
)
if __name__ == "__main__":
    iface.launch()
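    # For long-running model inference like this, one option is to queue
    # requests instead: iface.queue().launch()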