Spaces:

flocolombari
/

COLOMBARI_VIGNES-FERRINO_DERNIAUX_NIYONKURU

Runtime error

App Files Files Community

COLOMBARI_VIGNES-FERRINO_DERNIAUX_NIYONKURU / app.py

flocolombari

Update app.py

5c64ea3 about 1 year ago

raw

history blame

2.38 kB

	import gradio as gr
	from transformers import pipeline
	import cv2
	from PIL import Image
	import io

	# Import the translation pipeline
	from transformers import pipeline as translation_pipeline

	def video_to_descriptions(video, target_language="en"):
	# Load the image-to-text and summarization pipelines
	ImgToText = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
	Summarize = pipeline("summarization", model="tuner007/pegasus_summarizer")

	# Load the translation pipeline for the target language
	translator = translation_pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")
	audio = pipeline("text-to-speech", model="suno/bark")

	# Open the video
	cap = cv2.VideoCapture(video)
	fps = int(cap.get(cv2.CAP_PROP_FPS))

	descriptions = []
	frame_count = 0

	while True:
	ret, frame = cap.read()
	if not ret:
	break

	# Extract an image every 2 seconds
	if frame_count % (fps * 2) == 0:
	# Convert the image to RGB
	frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	# Convert the numpy array to a PIL image
	pil_img = Image.fromarray(frame_rgb)
	# Get the image description
	outputs = ImgToText(pil_img)
	description = outputs[0]['generated_text']
	descriptions.append(description)

	frame_count += 1

	# Close the video reader
	cap.release()

	# Concatenate the descriptions
	concatenated_descriptions = " ".join(descriptions)
	concatenated_descriptions = Summarize(concatenated_descriptions, max_length=(len(concatenated_descriptions) / 3))

	# Translate the summarized text into the target language
	translated_text = translator(concatenated_descriptions[0]["summarized-text"])[0]["translation_text"]

	audio_file = audio(translated_text)[0]["audio"]

	return audio_file

	# Create a dropdown menu with language options
	language_dropdown = gr.Dropdown(
	["en", "fr", "de", "es"], label="Language", info="The Language of the output"
	)

	iface = gr.Interface(
	fn=video_to_descriptions,
	inputs=[gr.Video(label="Import a Video", info="The Video to be described"), language_dropdown],
	outputs="audio",
	live=False
	)

	if __name__ == "__main__":
	iface.launch()