ManishThota committed
Commit 9178374
1 Parent(s): df2ba9f

Update app.py

Files changed (1):
  1. app.py +15 -10
app.py CHANGED
@@ -10,13 +10,15 @@ import json
 import csv
 import io
 
+# Model Configuration
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16
 )
 
-model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'
+model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'
 
+# Load Model and Processor
 processor = LlavaNextVideoProcessor.from_pretrained(model_name)
 model = LlavaNextVideoForConditionalGeneration.from_pretrained(
     model_name,
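
Note: the hunk cuts off before the remaining from_pretrained keyword arguments, so the full load call is not visible here. For context, a 4-bit load of this checkpoint typically looks like the sketch below; the quantization_config and device_map arguments are assumptions about the truncated lines, not part of this commit.

import torch
from transformers import (
    BitsAndBytesConfig,
    LlavaNextVideoForConditionalGeneration,
    LlavaNextVideoProcessor,
)

# 4-bit weights with fp16 compute keep the 7B model within a single GPU's memory budget.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'

processor = LlavaNextVideoProcessor.from_pretrained(model_name)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=quantization_config,  # assumed: hidden by the hunk cutoff
    device_map="auto",                        # assumed: hidden by the hunk cutoff
)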
@@ -25,7 +27,6 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
 )
 
 
-@spaces.GPU
 def read_video_pyav(container, indices):
     '''
     Decode the video with PyAV decoder.
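
Note: @spaces.GPU is the ZeroGPU decorator from the Hugging Face spaces package; it allocates a GPU for the duration of the decorated call. Removing it here is sensible because frame decoding is CPU-only work, and the decorator remains on analyze_videos below. For reference, the frame-sampling helper from the LLaVA-NeXT-Video model card reads roughly as follows:

import numpy as np

def read_video_pyav(container, indices):
    '''Decode the frames at the given indices from a PyAV container.'''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])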
@@ -63,18 +64,23 @@ def process_video(video_file, question):
     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
     input = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
     generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}
-    output = model.generate(**input, **generate_kwargs)
+
+    # Disable gradient calculation during inference
+    with torch.no_grad():
+        output = model.generate(**input, **generate_kwargs)
+
     generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
     return generated_text.split("ASSISTANT: ", 1)[-1].strip()
+
 @spaces.GPU
 def analyze_videos(video_files, selected_questions):
     """Analyzes videos, saves results to CSV, and returns CSV data and JSON."""
     all_results = {}
     questions = {
-        "hands_free": "Examine the subject’s right and left hands in the video to check if they are holding anything like a microphone, book, paper (white color), object, or any electronic device; try segmentation and decide if the hands are free or not.",
-        "standing/sitting": "Evaluate the subject’s body posture and movement within the video. Are they standing upright with both feet planted firmly on the ground? If so, they are standing. If they seem to be seated, they are seated.",
+        "hands_free": "Is the subject's hand in the video free or not?",
+        "standing": "Is the subject in the video sitting or standing?",
         "interaction_with_background": "Assess the surroundings behind the subject in the video. Do they seem to interact with any visible screens, such as laptops, TVs, or digital billboards? If yes, then they are interacting with a screen. If not, they are not interacting with a screen.",
-        "indoors/outdoors": "Consider the broader environmental context shown in the video’s background. Are there signs of an open-air space, like greenery, structures, or people passing by? If so, it’s an outdoor setting. If the setting looks confined with furniture, walls, or home decorations, it’s an indoor environment."
+        "indoors": "Consider the broader environmental context shown in the video’s background. Are there signs of an open-air space, like greenery, structures, or people passing by? If so, it’s an outdoor setting. If the setting looks confined with furniture, walls, or home decorations, it’s an indoor environment."
     }
 
     for video_file in video_files:
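
Note: wrapping generate() in torch.no_grad() stops autograd from recording the forward pass, which trims GPU memory during inference. Two small observations on the surrounding lines: input shadows the Python built-in (inputs would be a safer name), and because the answer is later string-matched for "yes", sampling with top_p=0.9 can flip a verdict between runs. A deterministic variant, offered as a suggestion rather than what this commit does:

generate_kwargs = {"max_new_tokens": 100, "do_sample": False}  # greedy decoding
with torch.inference_mode():  # like no_grad(), with slightly less overhead
    output = model.generate(**inputs, **generate_kwargs)  # `inputs` renamed from `input`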
@@ -84,7 +90,7 @@ def analyze_videos(video_files, selected_questions):
             answer = process_video(video_file, questions[question_key])
             all_results[video_name][question_key] = "true" if "yes" in answer.lower() else "false"
 
-            del answer
+        # Clear cache and collect garbage after each video
         gc.collect()
         torch.cuda.empty_cache()
 
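
Note: dropping del answer is harmless, since the name is rebound on every iteration anyway. Worth remembering that torch.cuda.empty_cache() only returns cached allocator blocks to the driver; tensors still referenced from Python stay resident, which is why gc.collect() runs first:

import gc
import torch

def release_gpu_memory():
    """Drop unreachable Python objects first, then return cached CUDA blocks."""
    gc.collect()
    torch.cuda.empty_cache()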
@@ -102,7 +108,6 @@ def analyze_videos(video_files, selected_questions):
     json_output = json.dumps(all_results, indent=4)
     return json_output, csv_content
 
-
 def download_csv(csv_content):
     """Creates a downloadable CSV file."""
     return gr.File.update(
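
Note: gr.File.update() is the Gradio 3.x idiom for updating a component from a callback; Gradio 4 removed the per-component update() methods in favor of returning a fresh component. If this Space is ever migrated, the equivalent would be along these lines (a sketch: the keyword arguments under the truncated return are not shown in this hunk, and the filename is hypothetical):

def download_csv(csv_content):
    """Creates a downloadable CSV file (Gradio 4.x style)."""
    path = "analysis_results.csv"  # hypothetical filename
    with open(path, "w", newline="") as f:
        f.write(csv_content)
    return gr.File(value=path, visible=True)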
@@ -114,10 +119,10 @@ def download_csv(csv_content):
 with gr.Blocks() as iface:
     with gr.Row():
         file_input = gr.File(label="Upload Videos", file_count="multiple")
-        question_input = gr.CheckboxGroup(["hands_free", "standing/sitting", "interaction_with_background", "indoors/outdoors"],
+        question_input = gr.CheckboxGroup(["hands_free", "standing", "interaction_with_background", "indoors"],
                                           label="Select Questions to Apply")
 
-    process_button = gr.Button("Process Videos")  # Process button below checkboxes
+    process_button = gr.Button("Process Videos")
 
     with gr.Row():
         json_output = gr.JSON(label="Analysis Results (JSON)")
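
Note: the diff ends before any event wiring, so the click handler is not shown. Presumably app.py connects the pieces along these lines; the csv_output name is a guess for the second output component:

process_button.click(
    fn=analyze_videos,
    inputs=[file_input, question_input],
    outputs=[json_output, csv_output],  # csv_output: hypothetical component name
)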
 