import gradio as gr
from transformers import pipeline
import cv2
from PIL import Image

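# The Hugging Face checkpoints used below are downloaded (and cached) on
# first use, so the first run can take a while.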
def video_to_descriptions(video, target_language="en"):
    """Caption sampled video frames, summarize the captions, translate the
    summary, and return it as synthesized speech."""
    # Load the image-to-text, summarization, and text-to-speech pipelines
    img_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    summarizer = pipeline("summarization", model="tuner007/pegasus_summarizer")
    tts = pipeline("text-to-speech", model="suno/bark")

    # Only load a translator when the target is not English: the captions are
    # already English, and there is no Helsinki-NLP/opus-mt-en-en checkpoint
    translator = None
    if target_language != "en":
        translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
    
    # Open the video; CAP_PROP_FPS can be 0 for some containers, so fall back
    # to a nominal 30 fps to keep the sampling interval valid
    cap = cv2.VideoCapture(video)
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30
    
    descriptions = []
    frame_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Caption one frame every two seconds of video
        if frame_count % (fps * 2) == 0:
            # OpenCV reads frames as BGR; convert to RGB for PIL
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Convert the numpy array to a PIL image
            pil_img = Image.fromarray(frame_rgb)
            # Get the image description
            outputs = img_to_text(pil_img)
            description = outputs[0]["generated_text"]
            descriptions.append(description)
        
        frame_count += 1

    # Close the video reader
    cap.release()

    # Concatenate the per-frame captions and summarize them; max_length counts
    # tokens, so a third of the character count is only a rough upper bound
    concatenated_descriptions = " ".join(descriptions)
    summary = summarizer(concatenated_descriptions, max_length=len(concatenated_descriptions) // 3)
    summary_text = summary[0]["summary_text"]

    # Translate the summarized text into the target language (English passes through)
    if translator is not None:
        summary_text = translator(summary_text)[0]["translation_text"]

    # Synthesize speech; the text-to-speech pipeline returns a dict with an
    # "audio" array and its "sampling_rate", the pair gr.Audio expects
    speech = tts(summary_text)
    return speech["sampling_rate"], speech["audio"].squeeze()

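# The function can also be called directly, bypassing the UI; the clip path
# below is purely illustrative:
#
#   sample_rate, waveform = video_to_descriptions("clips/demo.mp4", "fr")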
# Dropdown menu with the supported output languages
language_dropdown = gr.Dropdown(
    ["en", "fr", "de", "es"], label="Language", info="The language of the output audio"
)

iface = gr.Interface(
    fn=video_to_descriptions,
    inputs=[gr.Video(label="Import a Video"), language_dropdown],
    outputs="audio",
    live=False,  # run only when the user clicks Submit
)

if __name__ == "__main__":
    iface.launch()
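
# Passing share=True to iface.launch() would additionally serve a temporary
# public URL, which is handy for testing the demo from another device.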