Spaces:

metek7
/

instagram-short-summarizing

Runtime error

App Files Files Community

instagram-short-summarizing / app.py

metek7

Update app.py

40bd94b verified 9 days ago

raw

history blame contribute delete

4.97 kB

	import copy
	import os
	import subprocess
	import sys

	import gradio as gr
	from deep_translator import GoogleTranslator
	import torch
	from llava.model.builder import load_pretrained_model
	from llava.mm_utils import tokenizer_image_token
	from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
	from llava.conversation import conv_templates
	from decord import VideoReader, cpu
	import numpy as np

	# Install the required libraries at startup.
	# NOTE: `deep_translator` is imported at the top of this file, i.e. before the
	# install below runs, so this step cannot rescue a missing package on the
	# first start. Declaring it in the Space's requirements is the reliable fix.
	subprocess.run(
	    # Use the running interpreter's pip so the package lands in the
	    # environment that actually executes this app.
	    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
	    # Extend the inherited environment instead of replacing it; passing a
	    # bare dict as `env` would drop PATH, HOME, proxy settings, etc.
	    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
	    check=False,  # best-effort: a failed install must not abort startup
	)
	subprocess.run(
	    [sys.executable, "-m", "pip", "install", "deep_translator"],
	    check=False,
	)

	# Create the translator objects: `translator` converts Turkish to English
	# (used on the user's question) and `translator_reverse` converts English
	# back to Turkish (used on the model's answer).
	translator = GoogleTranslator(source='tr', target='en')
	translator_reverse = GoogleTranslator(source='en', target='tr')

	# Markdown snippets rendered by the Gradio layout defined later in this file.
	title = "# 🙋🏻‍♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
	description1 = """🌋📹LLaVA-Video-7B-Qwen2, ...
	"""
	description2 = """
	...
	"""

	# Body of the collapsible "Bize Katılın" accordion.
	join_us = """
	## Bize Katılın:
	...
	"""

	def load_video(video_path, max_frames_num, fps=1, force_sample=False):
	    """Decode a video file and return sampled frames with timing metadata.

	    Args:
	        video_path: Path handed to ``VideoReader``.
	        max_frames_num: Maximum number of frames to return. ``0`` skips
	            decoding and yields a single all-zero 336x336 RGB placeholder.
	        fps: Target sampling rate in frames per second.
	        force_sample: When true, always resample to ``max_frames_num``
	            uniformly spaced frames, even if fewer were selected by ``fps``.

	    Returns:
	        A ``(frames, frame_time, video_time)`` tuple: the decoded frames as a
	        NumPy array (presumably frames x height x width x channels; confirm
	        against decord), a comma-separated string of timestamps such as
	        ``"0.00s,1.00s"``, and the video duration in seconds.
	    """
	    if max_frames_num == 0:
	        # Return the same three-element shape as the normal path so callers
	        # that unpack the result do not fail on this edge case.
	        return np.zeros((1, 336, 336, 3)), "", 0.0

	    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
	    total_frame_num = len(vr)
	    avg_fps = vr.get_avg_fps()

	    # Clamp the stride to 1: for low-frame-rate videos the rounded ratio can
	    # be 0, and range() raises ValueError on a zero step.
	    step = max(1, round(avg_fps / fps))
	    frame_idx = list(range(0, total_frame_num, step))

	    if len(frame_idx) > max_frames_num or force_sample:
	        # Too many frames (or sampling forced): pick evenly spaced indices
	        # across the whole video instead.
	        frame_idx = np.linspace(
	            0, total_frame_num - 1, max_frames_num, dtype=int
	        ).tolist()

	    frame_time = ",".join(f"{idx / avg_fps:.2f}s" for idx in frame_idx)
	    spare_frames = vr.get_batch(frame_idx).asnumpy()

	    return spare_frames, frame_time, total_frame_num / avg_fps

	# Load the model
	model_name = "llava_qwen"
	pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
	device_map = "auto"
	# Tensors created by this app are moved to the GPU when one is available.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	print("Model yükleniyor...")
	(
	    tokenizer,
	    model,
	    image_processor,
	    max_length,
	) = load_pretrained_model(
	    pretrained,
	    None,
	    model_name,
	    torch_dtype="bfloat16",
	    device_map=device_map,
	)
	# Switch to evaluation mode before serving requests.
	model.eval()
	print("Model başarıyla yüklendi!")

	def _split_for_translation(text, max_chars):
	    """Split ``text`` into pieces of at most ``max_chars``, preferring line breaks."""
	    chunks = []
	    current = ""
	    for line in text.splitlines(keepends=True):
	        # A single over-long line has to be cut mid-line.
	        while len(line) > max_chars:
	            if current:
	                chunks.append(current)
	                current = ""
	            chunks.append(line[:max_chars])
	            line = line[max_chars:]
	        if len(current) + len(line) > max_chars:
	            chunks.append(current)
	            current = ""
	        current += line
	    if current:
	        chunks.append(current)
	    return chunks


	def _translate_long(translate, text, max_chars=4500):
	    """Apply ``translate`` to ``text``, chunking it when it exceeds ``max_chars``."""
	    # Short (or non-string) input takes the single-request path unchanged.
	    if not isinstance(text, str) or len(text) <= max_chars:
	        return translate(text)
	    translated = (
	        translate(chunk)
	        for chunk in _split_for_translation(text, max_chars)
	        if chunk.strip()
	    )
	    return "\n".join(piece.strip() for piece in translated if piece)


	def process_video(video_path, question):
	    """Answer a Turkish question about a video and return a Turkish reply.

	    The video is sampled to 64 frames, the question is translated to English,
	    the model generates an answer, and that answer is translated to Turkish.

	    Args:
	        video_path: Path of the video file to analyse.
	        question: The user's question (expected to be Turkish).

	    Returns:
	        The translated answer, or a message starting with
	        ``"Bir hata oluştu: "`` if any step raised an exception.
	    """
	    try:
	        max_frames_num = 64
	        video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
	        video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
	        video = [video]

	        conv_template = "qwen_1_5"
	        # NOTE(review): this instruction is sent in Turkish while the question
	        # below is translated to English; confirm the mixed-language prompt
	        # is intended.
	        time_instruction = f"Video {video_time:.2f} saniye sürmektedir ve {len(video[0])} kare uniform olarak örneklenmiştir. Bu kareler {frame_time} konumlarında bulunmaktadır. Lütfen bu videoyla ilgili aşağıdaki soruları cevaplayın."

	        # Translate the question to English
	        question_en = _translate_long(translator.translate, question)
	        full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{question_en}"

	        # Copy the template so the shared conversation object is not mutated.
	        conv = copy.deepcopy(conv_templates[conv_template])
	        conv.append_message(conv.roles[0], full_question)
	        conv.append_message(conv.roles[1], None)
	        prompt_question = conv.get_prompt()

	        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").to(device)

	        with torch.no_grad():
	            output = model.generate(
	                input_ids,
	                images=video,
	                modalities=["video"],
	                do_sample=False,
	                temperature=0,
	                max_new_tokens=4096,
	            )

	        response = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()

	        # Translate the answer to Turkish. The answer may be several thousand
	        # characters long and the translator limits the size of one request,
	        # so long answers are translated in chunks instead of failing after
	        # the expensive generation step.
	        response_tr = _translate_long(translator_reverse.translate, response)
	        return response_tr
	    except Exception as e:
	        # UI boundary: report the failure to the user, but keep the full
	        # traceback in the server log for debugging.
	        import traceback

	        traceback.print_exc()
	        return f"Bir hata oluştu: {str(e)}"

	def gradio_interface(video_file, question):
	    """Click handler: validate the upload, then delegate to ``process_video``.

	    Returns a Turkish prompt asking for a video when none was uploaded;
	    otherwise returns whatever ``process_video`` produces.
	    """
	    if video_file is not None:
	        return process_video(video_file, question)
	    return "Lütfen bir video dosyası yükleyin."

	# Build the Gradio UI.
	# NOTE(review): the original indentation of this section was lost, so the
	# nesting below is reconstructed. In particular, placing `output` in the
	# second row beside the input column is an assumption; confirm the layout.
	with gr.Blocks() as demo:
	    gr.Markdown(title)
	    # First row: the two description panels.
	    with gr.Row():
	        with gr.Group():
	            gr.Markdown(description1)
	        with gr.Group():
	            gr.Markdown(description2)
	    # Collapsed by default (open=False).
	    with gr.Accordion("Bize Katılın", open=False):
	        gr.Markdown(join_us)
	    # Second row: inputs and the answer box.
	    with gr.Row():
	        with gr.Column():
	            video_input = gr.Video()
	            question_input = gr.Textbox(label="🙋🏻‍♂️Kullanıcı Sorusu", placeholder="Video hakkında bir soru sorun...")
	            submit_button = gr.Button("🌋📹LLaVA-Video'ya Sor")
	        output = gr.Textbox(label="🌋📹LLaVA-Video")

	    # Wire the button: video + question go to gradio_interface and the
	    # returned text is shown in `output`.
	    submit_button.click(
	        fn=gradio_interface,
	        inputs=[video_input, question_input],
	        outputs=output
	    )

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch(show_error=True)