ManishThota committed on
Commit
ba1eb4b
1 Parent(s): ae7a212

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
2
+ import torch
3
+ import numpy as np
4
+ import av
5
+ import spaces
6
+ import gradio as gr
7
+ import os
8
+ import json
9
+
10
# Checkpoint identifier shared by the processor and the model below.
model_name = "llava-hf/LLaVA-NeXT-Video-7B-DPO-hf"

# Model Configuration: load the weights in 4-bit and use float16 as the
# compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Load Model and Processor once at import time so every request reuses them.
processor = LlavaNextVideoProcessor.from_pretrained(model_name)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)
25
+
26
def read_video_pyav(container, indices):
    """Decode the frames at the given positions from an open PyAV container.

    This helper only calls PyAV and NumPy, so it is a plain function and does
    not request a GPU of its own.

    Args:
        container: An opened PyAV container; it is rewound with ``seek(0)``
            and its first video stream is decoded.
        indices: Iterable of zero-based frame positions to keep. Order and
            duplicates do not matter; each matching frame is returned once,
            in decode order.

    Returns:
        A NumPy array stacking the selected frames, each converted with
        ``to_ndarray(format="rgb24")``.

    Raises:
        ValueError: If ``indices`` is empty, or if none of the requested
            positions exist in the stream.
    """
    # A set gives constant-time membership checks per decoded frame and makes
    # the function independent of the ordering of ``indices``.
    wanted = {int(i) for i in indices}
    if not wanted:
        raise ValueError("indices must contain at least one frame position")
    last_wanted = max(wanted)

    container.seek(0)
    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            # Everything requested has been passed; stop decoding early.
            break
        if i in wanted:
            frames.append(frame.to_ndarray(format="rgb24"))

    if not frames:
        raise ValueError(
            "none of the requested frame positions were found in the video"
        )
    return np.stack(frames)
41
+
42
@spaces.GPU
def process_video(video_file, question, num_frames=8):
    """Answer ``question`` about a single video with the loaded model.

    Args:
        video_file: Either a filesystem path, or an object that exposes the
            path through a ``name`` attribute.
        question: Prompt text sent to the model together with the video.
        num_frames: Number of evenly spaced frame positions to sample.
            Clips shorter than this yield repeated positions, so fewer
            distinct frames may be decoded.

    Returns:
        The decoded generation with everything up to and including the first
        ``"ASSISTANT: "`` marker removed and surrounding whitespace stripped.

    Raises:
        ValueError: If ``num_frames`` is less than 1 or the video has no
            decodable frames.
    """
    if num_frames < 1:
        raise ValueError("num_frames must be at least 1")

    # Accept both plain paths and upload objects carrying the path in ``.name``.
    video_path = getattr(video_file, "name", video_file)

    with av.open(video_path) as container:
        total_frames = container.streams.video[0].frames
        if total_frames <= 0:
            # The stream reports 0 when the frame count is unknown; fall back
            # to counting by decoding so sampling never uses a zero step.
            total_frames = sum(1 for _ in container.decode(video=0))
        if total_frames <= 0:
            raise ValueError(f"No video frames found in {video_path!r}")

        indices = np.arange(0, total_frames, total_frames / num_frames).astype(int)
        video_clip = read_video_pyav(container, indices)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "video"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    # Named ``inputs`` rather than ``input`` to avoid shadowing the builtin.
    inputs = processor(
        [prompt], videos=[video_clip], padding=True, return_tensors="pt"
    ).to(model.device)
    generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}
    output = model.generate(**inputs, **generate_kwargs)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
    # Keep only what follows the first assistant marker; if the marker is
    # absent the whole decoded text is returned.
    return generated_text.split("ASSISTANT: ", 1)[-1].strip()
67
+
68
@spaces.GPU
def analyze_videos(video_files, selected_question):
    """Run the selected predefined question against every uploaded video.

    Args:
        video_files: Iterable of uploaded videos, each either a path or an
            object exposing the path through a ``name`` attribute. ``None``
            is treated the same as an empty upload.
        selected_question: Key of one of the predefined questions.

    Returns:
        A JSON string (4-space indent) mapping each video's base filename to
        ``{selected_question: answer}``.

    Raises:
        gr.Error: If ``selected_question`` is not a predefined key, for
            example when nothing was chosen in the dropdown.
    """
    questions = {
        "hands_free": "Examine the subject’s right and left hands in the video to check if they are holding anything like a microphone, book, paper(White color), object, or any electronic device, try segmentations and decide if the hands are free or not.",
        "standing/sitting": "Evaluate the subject’s body posture and movement within the video. Are they standing upright with both feet planted firmly on the ground? If so, they are standing. If they seem to be seated, they are seated.",
        "interaction_with_background": "Assess the surroundings behind the subject in the video. Do they seem to interact with any visible screens, such as laptops, TVs, or digital billboards? If yes, then they are interacting with a screen. If not, they are not interacting with a screen.",
        "indoors/outdoors": "Consider the broader environmental context shown in the video’s background. Are there signs of an open-air space, like greenery, structures, or people passing by? If so, it’s an outdoor setting. If the setting looks confined with furniture, walls, or home decorations, it’s an indoor environment.",
    }

    # Fail with a readable UI message instead of an opaque KeyError.
    if selected_question not in questions:
        raise gr.Error("Please select one of the available questions.")
    question_text = questions[selected_question]

    all_results = {}
    # ``video_files`` is None when nothing was uploaded.
    for video_file in video_files or []:
        video_path = getattr(video_file, "name", video_file)
        # Results are keyed by base filename, so two uploads sharing a
        # filename overwrite each other.
        video_name = os.path.basename(video_path)
        answer = process_video(video_file, question_text)
        all_results[video_name] = {selected_question: answer}
    return json.dumps(all_results, indent=4)
85
+
86
# Define Gradio interface: a multi-file upload plus a question selector,
# with the analysis rendered in a JSON component.
iface = gr.Interface(
    fn=analyze_videos,
    inputs=[
        gr.File(file_count="multiple", label="Upload Videos"),
        gr.Dropdown(
            choices=[
                "hands_free",
                "standing/sitting",
                "interaction_with_background",
                "indoors/outdoors",
            ],
            label="Select Question to Apply",
        ),
    ],
    outputs=gr.JSON(label="Analysis Results"),
    title="Video Analysis",
    description="Upload videos and select a question to analyze.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(debug=True)