Spaces:
Runtime error
Runtime error
File size: 4,872 Bytes
e95a3a8 673f8e1 5bf65b0 e95a3a8 5bf65b0 b04464a e274fdd 673f8e1 e95a3a8 673f8e1 5bf65b0 e95a3a8 673f8e1 e95a3a8 5bf65b0 e95a3a8 5bf65b0 e95a3a8 5bf65b0 e95a3a8 673f8e1 60e801c 673f8e1 60e801c 673f8e1 528611f 673f8e1 b04464a d377414 e274fdd d377414 8edb003 2f8463c 8edb003 e95a3a8 2f8463c 673f8e1 d377414 2f8463c 673f8e1 528611f 673f8e1 2f8463c 5bf65b0 2f8463c 5bf65b0 e95a3a8 2f8463c 5bf65b0 e95a3a8 2f8463c e274fdd 673f8e1 d377414 2f8463c 673f8e1 e95a3a8 bdeec31 c5be64b 66c23b6 bdeec31 66c23b6 c5be64b bdeec31 673f8e1 e95a3a8 673f8e1 e95a3a8 5bf65b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer, pipeline
from PIL import Image
from decord import VideoReader, cpu
import base64
import io
import spaces
import time
import os
from transformers.pipelines.audio_utils import ffmpeg_read
import moviepy.editor as mp
# ---------------------------------------------------------------------------
# Model setup -- executed once, at import time.
# ---------------------------------------------------------------------------

# Vision-language model: loaded in bfloat16, moved to CUDA, switched to
# inference mode. The same checkpoint id is used for the tokenizer.
model_path = 'openbmb/MiniCPM-V-2_6'
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.to(device='cuda')
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)
model.eval()

# Speech recognition: a transformers ASR pipeline around Whisper, processing
# audio in 30-second chunks. Runs on CUDA when available, otherwise on CPU.
whisper_model = "openai/whisper-large-v3"
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model=whisper_model,
    chunk_length_s=30,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Upper bound on how many video frames encode_video hands to the model.
MAX_NUM_FRAMES = 64
def encode_image(image):
    """Return *image* as a PIL image whose longest side is at most 448 * 16 px.

    Args:
        image: Either a ``PIL.Image.Image`` (used as-is, mode untouched) or
            anything ``Image.open`` accepts, in which case it is opened and
            converted to RGB.

    Returns:
        The input image when it already fits, otherwise a bicubic-resized
        copy that keeps the aspect ratio and whose longest side equals the
        limit.
    """
    if not isinstance(image, Image.Image):
        image = Image.open(image).convert("RGB")

    max_size = 448 * 16
    if max(image.size) <= max_size:
        return image

    w, h = image.size
    if w > h:
        new_w = max_size
        new_h = int(h * max_size / w)
    else:
        new_h = max_size
        new_w = int(w * max_size / h)

    # For extremely elongated images the scaled short side truncates to 0,
    # which resize() rejects; keep every dimension at least one pixel.
    new_w = max(1, new_w)
    new_h = max(1, new_h)
    return image.resize((new_w, new_h), resample=Image.BICUBIC)
def encode_video(video_path):
    """Decode a video into at most ``MAX_NUM_FRAMES`` PIL frames.

    Candidate frames are taken one every ``round(average fps)`` frames
    (about one per second). When there are more candidates than
    ``MAX_NUM_FRAMES``, they are thinned uniformly over the whole clip so
    the selection still spans the full video rather than only its start.

    Args:
        video_path: Path of the video file, passed to decord's VideoReader.

    Returns:
        A list of PIL images, each passed through ``encode_image``.
    """
    vr = VideoReader(video_path, ctx=cpu(0))

    # A clip slower than 0.5 fps rounds to 0, which range() rejects as a
    # step; never step by less than one frame.
    step = max(1, round(vr.get_avg_fps()))
    frame_idx = list(range(0, len(vr), step))

    if len(frame_idx) > MAX_NUM_FRAMES:
        # Pick the middle candidate of each of MAX_NUM_FRAMES equal-width
        # buckets, so long videos are covered end to end.
        gap = len(frame_idx) / MAX_NUM_FRAMES
        frame_idx = [
            frame_idx[int(i * gap + gap / 2)] for i in range(MAX_NUM_FRAMES)
        ]

    frames = vr.get_batch(frame_idx).asnumpy()
    return [encode_image(Image.fromarray(f.astype('uint8'))) for f in frames]
def extract_audio(video_path):
    """Extract the audio track of a video into a temporary WAV file.

    Runs ``ffmpeg`` with the video stream disabled (``-vn``), two channels,
    a 44100 Hz sample rate and a 160k audio bitrate.

    Args:
        video_path: Path of the input video file.

    Returns:
        Path of the newly created WAV file. The caller is responsible for
        deleting it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    import subprocess
    import tempfile

    # A unique file per call, so concurrent requests cannot overwrite or
    # delete each other's audio.
    fd, audio_path = tempfile.mkstemp(prefix="temp_audio_", suffix=".wav")
    os.close(fd)

    command = [
        'ffmpeg',
        '-y',  # mkstemp already created the file; overwrite without prompting
        '-i', video_path,
        '-ab', '160k',
        '-ac', '2',
        '-ar', '44100',
        '-vn',
        audio_path,
    ]
    try:
        # stdin is detached so ffmpeg can never block waiting for input.
        subprocess.run(command, check=True, stdin=subprocess.DEVNULL)
    except (OSError, subprocess.CalledProcessError):
        # Do not leave an empty or partial file behind on failure.
        os.remove(audio_path)
        raise
    return audio_path
def transcribe_audio(audio_file):
    """Run the module-level Whisper pipeline on an audio file.

    The file's bytes are decoded with ``ffmpeg_read`` at the sampling rate
    of the pipeline's feature extractor, then passed to ``asr_pipeline``
    with the ``"translate"`` generation task and timestamps disabled.

    Args:
        audio_file: Path of the audio file to read.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    with open(audio_file, "rb") as handle:
        raw_bytes = handle.read()

    sampling_rate = asr_pipeline.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)

    result = asr_pipeline(
        {"array": waveform, "sampling_rate": sampling_rate},
        batch_size=8,
        generate_kwargs={"task": "translate"},
        return_timestamps=False,
    )
    return result["text"]
@spaces.GPU
def analyze_video(prompt, video, progress=gr.Progress()):
    """Analyze a video with the vision-language model and its transcript.

    Frames are sampled with ``encode_video``, the audio track is extracted
    and transcribed, and both are sent to ``model.chat`` together with the
    user's prompt.

    Args:
        prompt: The user's instruction for the model.
        video: Either a file path string or an object exposing the path as
            ``.name``.
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            how Gradio injects it, so it is intentionally a default argument.

    Returns:
        A ``(analysis_result, processing_time)`` tuple of display strings.

    Raises:
        gr.Error: If no video was provided.
    """
    start_time = time.time()
    progress(0, desc="Initializing")

    # Without an upload the value is None; fail with a readable message
    # instead of an AttributeError on ``video.name``.
    if video is None:
        raise gr.Error("Please upload a video before running the analysis.")
    video_path = video if isinstance(video, str) else video.name

    progress(0.1, desc="Encoding video")
    encoded_video = encode_video(video_path)

    progress(0.3, desc="Extracting audio")
    audio_path = extract_audio(video_path)
    try:
        progress(0.5, desc="Transcribing audio")
        transcription = transcribe_audio(audio_path)
    finally:
        # Always remove the temporary audio file, even when transcription
        # fails, so a leftover never interferes with later requests.
        try:
            os.remove(audio_path)
        except FileNotFoundError:
            # Extraction produced no file; there is nothing to clean up.
            pass
    print(f"Transcription: {transcription}")

    progress(0.7, desc="Preparing context")
    context = [
        {"role": "user", "content": encoded_video},
        {"role": "assistant", "content": f"Transcription of the video: {transcription}"},
        {"role": "user", "content": prompt},
    ]
    params = {
        'sampling': True,
        'top_p': 0.8,
        'top_k': 100,
        'temperature': 0.7,
        'repetition_penalty': 1.05,
        "max_new_tokens": 2048,
        "max_inp_length": 4352,
        "use_image_id": False,
        # A single slice per frame once more than 16 frames are sent.
        "max_slice_nums": 1 if len(encoded_video) > 16 else 2,
    }

    progress(0.8, desc="Generating response")
    response = model.chat(image=None, msgs=context, tokenizer=tokenizer, **params)

    progress(0.9, desc="Finalizing")
    elapsed = time.time() - start_time
    analysis_result = f"Analysis Result:\n{response}\n\n"
    processing_time = f"Processing Time: {elapsed:.2f} seconds"

    progress(1, desc="Complete")
    return analysis_result, processing_time
# Gradio front end: a video upload and prompt as inputs, two text boxes as
# outputs, and one button that runs analyze_video.
# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation was lost -- confirm the layout against the deployed app.
with gr.Blocks(theme="NoCrypt/miku") as demo:
    gr.Label("Video Analyzer with MiniCPM-V-2_6 and Whisper")

    with gr.Accordion("Input (Work best with English videos)"):
        with gr.Row():
            video_input = gr.Video(label="Upload Video")
            prompt_input = gr.Textbox(
                label="Prompt",
                value="Analyze this video, give me advice on how to improve it and score from 0 to 100 for each point",
            )

    with gr.Accordion("Output"):
        with gr.Row():
            analysis_result = gr.Textbox(label="Analysis Result")
            processing_time = gr.Textbox(label="Processing Time")

    analyze_button = gr.Button("Analyze Video")
    # Input order matches analyze_video(prompt, video); outputs match its
    # returned (analysis_result, processing_time) pair.
    analyze_button.click(
        fn=analyze_video,
        inputs=[prompt_input, video_input],
        outputs=[analysis_result, processing_time],
    )

demo.launch()
|