ManishThota committed on
Commit ecb85c3
1 Parent(s): c2bb44f

Create app.py

Files changed (1)
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
+ from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
+ import torch
+ import numpy as np
+ import av
+ import gradio as gr
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,  # load the weights in 4-bit to reduce GPU memory usage
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'
+
+ processor = LlavaNextVideoProcessor.from_pretrained(model_name)
+ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+     model_name,
+     quantization_config=quantization_config,
+     device_map='auto'
+ )
+
+
+ def read_video_pyav(container, indices):
+     '''
+     Decode the video with PyAV decoder.
+
+     Args:
+         container (av.container.input.InputContainer): PyAV container.
+         indices (List[int]): List of frame indices to decode.
+
+     Returns:
+         np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+     '''
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+ def process_video(video_file, question):
+     # Open the video and sample 8 evenly spaced frames
+     with av.open(video_file) as container:
+         total_frames = container.streams.video[0].frames
+         indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+         video_clip = read_video_pyav(container, indices)
+
+     # Prepare the conversation in the chat-template format
+     conversation = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": f"{question}"},
+                 {"type": "video"},
+             ],
+         },
+     ]
+     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+     # Prepare inputs for the model
+     inputs = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
+
+     # Generate output and keep only the assistant's part of the decoded text
+     generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}
+     output = model.generate(**inputs, **generate_kwargs)
+     generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
+
+     return generated_text.split("ASSISTANT: ", 1)[-1].strip()
+
+ # Define the Gradio interface
+ def gradio_interface(video, question):
+     return process_video(video, question)
+
+
+
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.Video(label="Upload Video"),
+         gr.Textbox(label="Enter Question")
+     ],
+     outputs=gr.Textbox(label="Generated Answer"),
+     title="Video Question Answering",
+     description="Upload a video and enter a question to get a generated text response."
+ )
+
+ if __name__ == "__main__":
+     iface.launch(debug=True)
+