Aliayub1995 committed on
Commit
e30d941
·
verified ·
1 Parent(s): 582e00e

Upload 56 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. README.md +13 -0
  3. examples/demo2.mp4 +3 -0
  4. examples/demo3.mp4 +3 -0
  5. examples/desert.jpg +0 -0
  6. examples/extreme_ironing.jpg +0 -0
  7. examples/rap.mp4 +3 -0
  8. examples/sora.png +3 -0
  9. examples/waterview.jpg +0 -0
  10. handler.py +61 -0
  11. pyproject.toml +39 -0
  12. requirements.txt +37 -0
  13. test_handler.py +29 -0
  14. videollama2/__init__.py +1 -0
  15. videollama2/constants.py +38 -0
  16. videollama2/conversation.py +484 -0
  17. videollama2/eval/eval_benchmark_1_correctness.py +210 -0
  18. videollama2/eval/eval_benchmark_2_detailed_orientation.py +210 -0
  19. videollama2/eval/eval_benchmark_3_context.py +212 -0
  20. videollama2/eval/eval_benchmark_4_temporal.py +206 -0
  21. videollama2/eval/eval_benchmark_5_consistency.py +218 -0
  22. videollama2/eval/eval_video_qa_gpt.py +219 -0
  23. videollama2/eval/eval_video_qa_mvbench.py +64 -0
  24. videollama2/eval/run_inference_video_qa_batch.py +563 -0
  25. videollama2/eval/run_inference_video_qa_gpt.py +151 -0
  26. videollama2/eval/run_inference_video_qa_gpt_consistency.py +182 -0
  27. videollama2/eval/run_inference_video_qa_gpt_general.py +177 -0
  28. videollama2/eval/run_inference_video_qa_perception_test_mcqa.py +214 -0
  29. videollama2/mm_utils.py +538 -0
  30. videollama2/model/__init__.py +3 -0
  31. videollama2/model/builder.py +170 -0
  32. videollama2/model/language_model/videollama2_llama.py +147 -0
  33. videollama2/model/language_model/videollama2_mistral.py +149 -0
  34. videollama2/model/language_model/videollama2_mixtral.py +149 -0
  35. videollama2/model/multimodal_encoder/builder.py +15 -0
  36. videollama2/model/multimodal_encoder/clip_encoder.py +84 -0
  37. videollama2/model/multimodal_projector/__init__.py +1 -0
  38. videollama2/model/multimodal_projector/builder.py +250 -0
  39. videollama2/model/videollama2_arch.py +346 -0
  40. videollama2/serve/cli.py +144 -0
  41. videollama2/serve/controller.py +298 -0
  42. videollama2/serve/examples/1034346401.mp4 +3 -0
  43. videollama2/serve/examples/desert.jpg +0 -0
  44. videollama2/serve/examples/extreme_ironing.jpg +0 -0
  45. videollama2/serve/examples/sample_demo_1.mp4 +3 -0
  46. videollama2/serve/examples/sample_demo_3.mp4 +0 -0
  47. videollama2/serve/examples/sample_demo_9.mp4 +0 -0
  48. videollama2/serve/examples/waterview.jpg +0 -0
  49. videollama2/serve/gradio_web_server.py +503 -0
  50. videollama2/serve/model_worker.py +397 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/demo2.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ examples/demo3.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ examples/rap.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ examples/sora.png filter=lfs diff=lfs merge=lfs -text
40
+ videollama2/serve/examples/1034346401.mp4 filter=lfs diff=lfs merge=lfs -text
41
+ videollama2/serve/examples/sample_demo_1.mp4 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: VideoLLaMA2
3
+ emoji: 🎥📸💬
4
+ colorFrom: red
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.36.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
examples/demo2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aec475bc837a1372f0b1c9ccea2c0c293f8d90f3f381f68f0691964d6d48fdca
3
+ size 3292167
examples/demo3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8783c215899aea0324b74bc4254b105bc2aa1759080dca0eb8166b2405e8cd5
3
+ size 4527999
examples/desert.jpg ADDED
examples/extreme_ironing.jpg ADDED
examples/rap.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde3c54b5e9aba7cf77c6530684a4bba45f661dd1ab664043375bf9582196200
3
+ size 13779546
examples/sora.png ADDED

Git LFS Details

  • SHA256: 8b11fb8f1214cfb0cae48a29373616ab8a4916252c4aacc43f1727ea4e628dc3
  • Pointer size: 132 Bytes
  • Size of remote file: 2.61 MB
examples/waterview.jpg ADDED
handler.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ import torch
3
+ from transformers import pipeline
4
+ from videollama2.conversation import conv_templates, SeparatorStyle
5
+ from videollama2.constants import DEFAULT_MMODAL_TOKEN, MMODAL_TOKEN_INDEX
6
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video, process_image
7
+ from videollama2.model.builder import load_pretrained_model
8
+
9
class EndpointHandler():
    """Inference-endpoint handler answering one question about one image or video.

    The checkpoint is loaded once in ``__init__``. Each call only looks at the
    first entry of the request's ``paths`` and ``questions`` lists and at
    ``modal_list[0]`` to pick the preprocessing path.
    """

    # One device string shared by the model, the visual tensor and the token ids
    # (the original mixed 'cuda' and 'cuda:0').
    device = 'cuda:0'

    def __init__(self, path="DAMO-NLP-SG/VideoLLaMA2-8x7B"):
        """Load tokenizer, model and visual processor from ``path`` and move the model to ``device``."""
        model_name = get_model_name_from_path(path)
        self.tokenizer, self.model, self.processor, self.context_len = load_pretrained_model(path, None, model_name)
        self.model = self.model.to(self.device)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run one generation request.

        Args:
            data: request payload with the keys ``"paths"``, ``"questions"`` and
                ``"modal_list"``, each a list. Only index 0 of ``paths`` and
                ``questions`` is used.

        Returns:
            ``[{"output": <decoded text>}]`` on success, or ``[{"error": <message>}]``
            when a required list is missing or empty.
        """
        # get inputs
        paths = data.get("paths", [])
        questions = data.get("questions", [])
        modal_list = data.get("modal_list", [])

        # Validate every list that is indexed below, so a bad request yields an
        # error payload instead of an IndexError.
        if not paths or not modal_list:
            return [{"error": "Missing paths or modal_list"}]
        if not questions:
            return [{"error": "Missing questions"}]

        # Visual preprocess (load & transform image or video).
        # Anything other than 'video' is handled as an image.
        if modal_list[0] == 'video':
            tensor = process_video(paths[0], self.processor, self.model.config.image_aspect_ratio).to(dtype=torch.float16, device=self.device, non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
            modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
        else:
            tensor = process_image(paths[0], self.processor, self.model.config.image_aspect_ratio)[0].to(dtype=torch.float16, device=self.device, non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["IMAGE"]
            modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
        tensor = [tensor]

        # Text preprocess (tag process & generate prompt)
        question = default_mm_token + "\n" + questions[0]
        conv_mode = 'llama_2'
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], question)
        # A None assistant message leaves the prompt open for the model to complete.
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        input_ids = tokenizer_MMODAL_token(prompt, self.tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to(self.device)

        # Generate a response according to visual signals and prompts
        stop_str = conv.sep if conv.sep_style in [SeparatorStyle.SINGLE] else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images_or_videos=tensor,
                modal_list=modal_list,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )
        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return [{"output": outputs[0]}]
pyproject.toml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "videollama2"
7
+ version = "1.0"
8
+ description = "Release of VideoLLaMA2"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: Apache Software License",
14
+ ]
15
+ dependencies = [
16
+ "torch>=2.0.1", "torchvision>=0.15.2",
17
+ "tokenizers==0.15.1", "sentencepiece==0.1.99",
18
+ "transformers==4.37.2", "accelerate==0.21.0",
19
+ "deepspeed==0.13.1", "peft==0.4.0", "shortuuid",
20
+ "decord==0.6.0", "pytorchvideo==0.1.5", "imageio==2.34.0", "imageio-ffmpeg==0.4.9",
21
+ "moviepy==1.0.3", "scenedetect==0.6.3", "numpy", "scikit-learn==1.2.2",
22
+ "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13",
23
+ "bitsandbytes==0.41.0", "pydantic<2,>=1", "markdown2[all]",
24
+ "gradio==3.35.2", "gradio_client==0.2.9", "opencv-python",
25
+ "requests", "httpx==0.24.0", "openai", "uvicorn", "fastapi", "wandb"
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ train = ["ninja"]
30
+
31
+ [project.urls]
32
+ "Homepage" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2"
33
+ "Bug Tracker" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues"
34
+
35
+ [tool.setuptools.packages.find]
36
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
37
+
38
+ [tool.wheel]
39
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
requirements.txt ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu117
2
+ # basic dependencies
3
+ torchvision==0.15.2
4
+ torch==2.0.1
5
+ transformers==4.37.2
6
+ tokenizers==0.15.1
7
+ deepspeed==0.13.1
8
+ accelerate
9
+ timm
10
+ numpy
11
+ spaces
12
+ peft==0.4.0
13
+ # data processing
14
+ decord==0.6.0
15
+ imageio==2.34.0
16
+ imageio-ffmpeg==0.4.9
17
+ pytorchvideo==0.1.5
18
+ moviepy==1.0.3
19
+ scenedetect==0.6.3
20
+ opencv-python==4.7.0.72
21
+ # misc
22
+ scikit-learn==1.2.2
23
+ huggingface_hub==0.22.2
24
+ sentencepiece==0.1.99
25
+ shortuuid
26
+ einops==0.6.1
27
+ einops-exts==0.0.4
28
+ bitsandbytes==0.41.0
29
+ pydantic<2,>=1
30
+ markdown2[all]
31
+ gradio==3.35.2
32
+ gradio_client==0.2.9
33
+ requests
34
+ httpx==0.24.0
35
+ uvicorn
36
+ fastapi
37
+ wandb
test_handler.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from handler import EndpointHandler


def main():
    """Send one video and one image request through the handler and print both results.

    The work lives in a function behind a ``__main__`` guard because this file
    name matches pytest's ``test_*.py`` pattern: at module level, merely
    collecting the file would instantiate the handler.
    """
    # Initialize handler
    my_handler = EndpointHandler(path="DAMO-NLP-SG/VideoLLaMA2-8x7B")

    # Prepare sample payloads
    # NOTE(review): these paths point at an "assets/" directory; confirm the files
    # exist relative to the working directory before running.
    video_payload = {
        "paths": ["assets/cat_and_chicken.mp4"],
        "questions": ["What animals are in the video, what are they doing, and how does the video feel?"],
        "modal_list": ["video"]
    }

    image_payload = {
        "paths": ["assets/sora.png"],
        "questions": ["What is the woman wearing, what is she doing, and how does the image feel?"],
        "modal_list": ["image"]
    }

    # Test the handler
    video_pred = my_handler(video_payload)
    image_pred = my_handler(image_payload)

    # Show results
    print("video_pred", video_pred)
    print("image_pred", image_pred)


if __name__ == "__main__":
    main()

# Expected Output Examples:
# video_pred [{'output': 'The video features a kitten and a baby chick playing together. The kitten is seen laying on the floor while the baby chick hops around. The two animals interact playfully with each other, and the video has a cute and heartwarming feel to it.'}]
# image_pred [{'output': 'The woman in the image is wearing a black coat and sunglasses, and she is walking down a rain-soaked city street. The image feels vibrant and lively, with the bright city lights reflecting off the wet pavement, creating a visually appealing atmosphere.'}]
videollama2/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .model import Videollama2LlamaForCausalLM, Videollama2MistralForCausalLM
videollama2/constants.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Serving: heart-beat timing (values are passed around as plain numbers).
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "./log_dir"

# Frame-sampling settings.
NUM_FRAMES = 8
MAX_FRAMES = 32
NUM_FRAMES_PER_SECOND = 1
Grids = [(2, 2), (1, 2), (1, 3), (1, 4), (2, 1), (3, 1), (4, 1)]

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_VIDEO_TOKEN = "<video>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"


# Per-modality lookup tables, keyed by upper-case modality name.
MMODAL_TOKEN_INDEX = {"IMAGE": -200, "VIDEO": -201, "AUDIO": -202}
# Reverse mapping: token index -> modality name.
MMODAL_INDEX_TOKEN = {v: k for k, v in MMODAL_TOKEN_INDEX.items()}
MMODAL_START_TOKEN_INDEX = {"IMAGE": "<im_start>", "VIDEO": "<vid_start>", "AUDIO": "<ad_start>"}
MMODAL_END_TOKEN_INDEX = {"IMAGE": "<im_end>", "VIDEO": "<vid_end>", "AUDIO": "<ad_end>"}


DEFAULT_MMODAL_TOKEN = {"IMAGE": "<image>", "VIDEO": "<video>", "AUDIO": "<audio>"}
DEFAULT_MMODAL_PATCH_TOKEN = {"IMAGE": "<im_patch>", "VIDEO": "<vid_patch>", "AUDIO": "<ad_patch>"}
DEFAULT_MMODAL_START_TOKEN = {"IMAGE": "<Image>", "VIDEO": "<Video>", "AUDIO": "<ad_start>"}
# The backslashes are written as "\\" so the strings keep their original runtime
# value ("<\Image>" etc.) without relying on invalid escape sequences.
# NOTE(review): a forward slash ("</Image>") may have been intended, and the AUDIO
# start token above does not follow the "<Audio>" pattern; values are left as-is
# because other code or checkpoints may depend on them - confirm.
DEFAULT_MMODAL_END_TOKEN = {"IMAGE": "<\\Image>", "VIDEO": "<\\Video>", "AUDIO": "<\\Audio>"}
videollama2/conversation.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import dataclasses
3
+ from io import BytesIO
4
+ from enum import auto, Enum
5
+ from typing import List, Tuple
6
+
7
+ from PIL import Image
8
+ from .constants import LOGDIR, NUM_FRAMES
9
+
10
+
11
class SeparatorStyle(Enum):
    """Prompt-separator conventions selectable on a conversation template."""
    # Values are spelled out; they match what consecutive auto() calls produce.
    SINGLE = 1
    TWO = 2
    MPT = 3
    PLAIN = 4
    LLAMA_2 = 5
18
+
19
+
20
@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    ``messages`` holds ``[role, message]`` pairs. A message is either a string,
    ``None`` (an open turn still to be generated) or a 3-tuple
    ``(text, image_or_video, image_process_mode)``.
    """
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False
    modality: str = "image"

    def get_prompt(self):
        """Render the system text and all messages into one prompt string.

        Raises:
            ValueError: if ``sep_style`` is not a known ``SeparatorStyle``.
            AssertionError: for ``LLAMA_2`` when the first message is empty or
                does not come from ``roles[0]``.
        """
        messages = self.messages
        modality_token = f"<{self.modality}>"
        if len(messages) > 0 and isinstance(messages[0][1], tuple):
            # Work on a shallow copy so the stored history is not modified.
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace(modality_token, "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                # Re-insert the modality token once, at the very start of the turn.
                messages[0] = (init_role, f"{modality_token}\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    if i == 0:
                        message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            # Drop exactly one leading separator. str.lstrip() would treat sep as a
            # set of characters and could eat into the prompt for other sep values.
            if self.sep and ret.startswith(self.sep):
                ret = ret[len(self.sep):]
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if isinstance(message, tuple):
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        """Append one ``[role, message]`` pair to the history."""
        self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=800, min_len=400):
        """Apply ``image_process_mode`` to a PIL image and cap its size.

        Args:
            image: PIL image to process.
            image_process_mode: ``"Pad"`` (pad to a square), ``"Default"`` or
                ``"Crop"`` (leave unchanged) or ``"Resize"`` (resize to 336x336).
            return_pil: return the PIL image instead of a base64 string.
            image_format: format used when encoding the image for base64 output.
            max_len: images whose longer side exceeds this are scaled down.
            min_len: upper bound on the shorter side after scaling down.

        Returns:
            The processed PIL image, or its base64-encoded bytes as ``str``.

        Raises:
            ValueError: for an unknown ``image_process_mode``.
        """
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                # Centre the image on a square canvas of the longer side's length.
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
        if max(image.size) > max_len:
            # Scale down while keeping the aspect ratio.
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))
        if return_pil:
            return image
        else:
            buffered = BytesIO()
            image.save(buffered, format=image_format)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            return img_b64_str

    def get_videos(self, return_pil=False):
        """Collect the videos attached to the even-indexed messages after ``offset``.

        Returns:
            A list of video file paths when ``return_pil`` is false; otherwise a
            flat list of ``NUM_FRAMES`` evenly spaced frames per video, each run
            through ``process_image``.
        """
        video_frames = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    # Imported lazily so decord/numpy are only needed when a video is present.
                    from decord import VideoReader, cpu
                    import numpy as np
                    # here video is the file path of input video
                    msg, video, image_process_mode = msg
                    if not return_pil:
                        # return filepath
                        video_frames.append(video)
                    else:
                        # read video using decord.VideoReader
                        decord_vr = VideoReader(uri=video, ctx=cpu(0))
                        duration = len(decord_vr)
                        frame_id_list = np.linspace(0, duration-1, NUM_FRAMES, dtype=int)
                        # convert the extracted image frames into PIL objects
                        all_images = [Image.fromarray(f) for f in decord_vr.get_batch(frame_id_list).asnumpy()]
                        video_frames.extend([self.process_image(image, image_process_mode, return_pil=return_pil) for image in all_images])
        return video_frames

    def get_images(self, return_pil=False):
        """Collect the processed images attached to the even-indexed messages after ``offset``."""
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    msg, image, image_process_mode = msg
                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
                    images.append(image)
        return images

    def to_gradio_chatbot(self):
        """Convert the history after ``offset`` into ``[user_html, assistant_text]`` pairs.

        Attached images are inlined as base64 JPEG ``<img>`` tags; anything else
        is treated as a video file path and rendered as a ``<video>`` tag.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    # display image/video in the textbox
                    msg, image_or_video, image_process_mode = msg
                    if isinstance(image_or_video, Image.Image):
                        # image is PIL object
                        img_b64_str = self.process_image(image_or_video, "Default", return_pil=False, image_format='JPEG')
                        img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                        msg = img_str + msg.replace('<image>', '').strip()
                    else:
                        # video is file path
                        vid_str = f'<video controls playsinline width="500" style="display: inline-block;" src="./file={image_or_video}"></video><br>'
                        msg = vid_str + msg.replace('<video>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                # Odd positions fill in the reply slot of the preceding entry.
                ret[-1][-1] = msg
        return ret

    def copy(self):
        """Return a new conversation with the same settings and a fresh message list.

        ``modality`` is carried over so a copied video conversation does not
        fall back to the "image" default. ``skip_next`` is left at its default.
        """
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
            modality=self.modality)

    def dict(self):
        """Return a plain-dict view of the conversation.

        When media of the current ``modality`` is attached, tuple messages are
        reduced to their text part and ``"modality"`` is included.
        """
        if (self.modality == "image" and len(self.get_images()) > 0) or \
                (self.modality == "video" and len(self.get_videos()) > 0):
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
                "modality": self.modality
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
310
+
311
# Prompt templates. Each template is created with a tuple for ``messages``, so
# ``append_message`` only works on the result of ``.copy()`` (which builds a list).

# Llama-2 style [INST] wrapping with an empty leading separator.
conv_mistral_instruct = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="",
    sep2="</s>",
)
# Single-separator template seeded with one example exchange; offset=2 makes the
# history helpers start after that example.
conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# Two-separator template: sep after user turns, sep2 after assistant turns.
conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# Llama-2 chat template with a two-paragraph system prompt.
conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# Same layout as conv_llama_2, with a vision-oriented system prompt.
conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# MPT template: the role strings already carry the <|im_start|> markers.
conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

# Plain template: no system prompt and empty role names.
conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# The "mmtag" in version selects the <Image>...</Image> branch of get_prompt.
# NOTE(review): the 2nd and 3rd system fragments join without a space
# ("...natural language.The visual content..."); left untouched because the exact
# prompt text is model-facing - confirm before changing.
conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# Two-separator variant of the mmtag template (same system text as v0_mmtag).
conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

# NOTE(review): default_conversation is conv_vicuna_v1 while the "default" registry
# key below maps to conv_vicuna_v0 - confirm this difference is intended.
default_conversation = conv_vicuna_v1
# Registry of templates by name; several names alias the same template object.
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,

    "video_llama_beta": conv_llava_llama_2,
    "mistral_instruct": conv_mistral_instruct,
    "mpt": conv_mpt,
}


# Running the module directly prints the prompt rendered from the default template.
if __name__ == "__main__":
    print(default_conversation.get_prompt())
videollama2/eval/eval_benchmark_1_correctness.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """
    Build an Azure OpenAI client from environment variables.

    Reads ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_KEY`` and pins the API
    version to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """
    Send one chat-completion request and return the raw completion object.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Value passed through as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is read from the environment on every call.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """
    Score predicted answers for factual correctness with the judge model.

    For every name in ``caption_files`` the matching entry of ``prediction_set``
    is sent to the judge via ``interaction`` and the parsed reply is written to
    ``<output_dir>/<key>.json`` as ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with keys ``'q'``,
            ``'a'`` and ``'p'``.
        caption_files: Iterable of ``"<key>.json"`` file names to process.
        output_dir: Directory that receives one JSON file per sample.
        args: Parsed command-line arguments; unused here, kept for the caller.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.

    A failure while querying or parsing a single sample is printed and the
    sample is skipped; no output file is written for it.
    """
    # Workers started with the "spawn"/"forkserver" methods re-import this module
    # without running the __main__ guard, so the global `client` may not exist
    # there. Fall back to building one from the environment.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # NOTE(review): the example reply at the end of the user prompt
            # ({''score': 4.8}) is not a valid Python literal and is not an
            # integer. It is kept verbatim so scores stay comparable.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
                        "- The predicted answer must be factually accurate and align with the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Evaluate the factual accuracy of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]
            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            # Reject replies that would crash the later averaging step; with no
            # file written, the sample stays "incomplete" for the caller.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected judge response: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            # Best effort per sample: report and continue with the next one.
            print(f"Error processing file '{key}': {e}")
88
+
89
+
90
def main(args):
    """
    Run the correctness evaluation end to end and print the average score.

    Steps: load predictions (one record per line), give every record a unique
    ``<video_name>_<n>`` key, annotate all records that have no result file in
    ``args.output_dir`` yet, merge the per-sample files into
    ``args.output_json`` and print the mean score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json`` and
            ``num_tasks`` attributes.
    """
    # One record per line. Parse as JSON first and fall back to a Python
    # literal; file content is never passed to eval(). Blank lines are skipped.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Suffix every video name with its occurrence index so keys are unique.
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of expected result files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive split count would make the partitioning below fail forever.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been written.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception:
            traceback.print_exc()

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was evaluated; avoid dividing by zero.
        print("No evaluated samples found; average score is undefined.")
        return
    average_score = sum(scores) / len(scores)

    print("Average score for correctness:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    # Command-line entry point: parse options, export the Azure settings, run the evaluation.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument(
        "--pred-path", required=True,
        help="The path to file containing prediction.",
    )
    parser.add_argument(
        "--output-dir", required=True,
        help="The path to save annotation json files.",
    )
    parser.add_argument(
        "--output-json", required=True,
        help="The path to save annotation final combined json file.",
    )
    parser.add_argument(
        "--num-tasks", required=True, type=int,
        help="Number of splits.",
    )
    parser.add_argument(
        "--api-key", required=True, type=str,
        help="Azure Openai API key.",
    )
    parser.add_argument(
        "--api-endpoint", required=True, type=str,
        help="Azure Openai API endpoint.",
    )
    parser.add_argument(
        "--api-deployname", required=True, type=str,
        help="Azure Openai API deployname.",
    )
    args = parser.parse_args()

    # init() and interaction() read their configuration from these variables.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared with annotate().
    client = init()

    main(args)
videollama2/eval/eval_benchmark_2_detailed_orientation.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ from tqdm import tqdm
6
+ from multiprocessing.pool import Pool
7
+
8
+ from openai import AzureOpenAI
9
+
10
+
11
def init():
    """
    Build an Azure OpenAI client from environment variables.

    Reads ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_KEY`` and pins the API
    version to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
19
+
20
+
21
def interaction(client, message_text):
    """
    Send one chat-completion request and return the raw completion object.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Value passed through as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is read from the environment on every call.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
34
+
35
+
36
def annotate(prediction_set, caption_files, output_dir, args):
    """
    Score predicted answers for detail orientation with the judge model.

    For every name in ``caption_files`` the matching entry of ``prediction_set``
    is sent to the judge via ``interaction`` and the parsed reply is written to
    ``<output_dir>/<key>.json`` as ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with keys ``'q'``,
            ``'a'`` and ``'p'``.
        caption_files: Iterable of ``"<key>.json"`` file names to process.
        output_dir: Directory that receives one JSON file per sample.
        args: Parsed command-line arguments; unused here, kept for the caller.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.

    A failure while querying or parsing a single sample is printed and the
    sample is skipped; no output file is written for it.
    """
    # Workers started with the "spawn"/"forkserver" methods re-import this module
    # without running the __main__ guard, so the global `client` may not exist
    # there. Fall back to building one from the environment.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # Compute the detailed-orientation score
            # NOTE(review): the example reply at the end of the user prompt
            # ({''score': 4.8}) is not a valid Python literal and is not an
            # integer. It is kept verbatim so scores stay comparable.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                        "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            # Reject replies that would crash the later averaging step; with no
            # file written, the sample stays "incomplete" for the caller.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected judge response: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            # Best effort per sample: report and continue with the next one.
            print(f"Error processing file '{key}': {e}")
88
+
89
+
90
def main(args):
    """
    Run the detail-orientation evaluation end to end and print the average score.

    Steps: load predictions (one record per line), give every record a unique
    ``<video_name>_<n>`` key, annotate all records that have no result file in
    ``args.output_dir`` yet, merge the per-sample files into
    ``args.output_json`` and print the mean score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json`` and
            ``num_tasks`` attributes.
    """
    # One record per line. Parse as JSON first and fall back to a Python
    # literal; file content is never passed to eval(). Blank lines are skipped.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Suffix every video name with its occurrence index so keys are unique.
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of expected result files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive split count would make the partitioning below fail forever.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been written.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was evaluated; avoid dividing by zero.
        print("No evaluated samples found; average score is undefined.")
        return
    average_score = sum(scores) / len(scores)

    print("Average score for detailed orientation:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    # Command-line entry point: parse options, export the Azure settings, run the evaluation.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument(
        "--pred-path", required=True,
        help="The path to file containing prediction.",
    )
    parser.add_argument(
        "--output-dir", required=True,
        help="The path to save annotation json files.",
    )
    parser.add_argument(
        "--output-json", required=True,
        help="The path to save annotation final combined json file.",
    )
    parser.add_argument(
        "--num-tasks", required=True, type=int,
        help="Number of splits.",
    )
    parser.add_argument(
        "--api-key", required=True, type=str,
        help="Azure Openai API key.",
    )
    parser.add_argument(
        "--api-endpoint", required=True, type=str,
        help="Azure Openai API endpoint.",
    )
    parser.add_argument(
        "--api-deployname", required=True, type=str,
        help="Azure Openai API deployname.",
    )
    args = parser.parse_args()

    # init() and interaction() read their configuration from these variables.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared with annotate().
    client = init()

    main(args)
videollama2/eval/eval_benchmark_3_context.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """
    Build an Azure OpenAI client from environment variables.

    Reads ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_KEY`` and pins the API
    version to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """
    Send one chat-completion request and return the raw completion object.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Value passed through as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is read from the environment on every call.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """
    Score predicted answers for contextual understanding with the judge model.

    For every name in ``caption_files`` the matching entry of ``prediction_set``
    is sent to the judge via ``interaction`` and the parsed reply is written to
    ``<output_dir>/<key>.json`` as ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with keys ``'q'``,
            ``'a'`` and ``'p'``.
        caption_files: Iterable of ``"<key>.json"`` file names to process.
        output_dir: Directory that receives one JSON file per sample.
        args: Parsed command-line arguments; unused here, kept for the caller.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.

    A failure while querying or parsing a single sample is printed and the
    sample is skipped; no output file is written for it.
    """
    # Workers started with the "spawn"/"forkserver" methods re-import this module
    # without running the __main__ guard, so the global `client` may not exist
    # there. Fall back to building one from the environment.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # Compute the contextual understanding score
            # NOTE(review): the example reply at the end of the user prompt
            # ({''score': 4.8}) is not a valid Python literal and is not an
            # integer. It is kept verbatim so scores stay comparable.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
                        "- The predicted answer must capture the main themes and sentiments of the video.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            # Reject replies that would crash the later averaging step; with no
            # file written, the sample stays "incomplete" for the caller.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected judge response: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            # Best effort per sample: report and continue with the next one.
            print(f"Error processing file '{key}': {e}")
90
+
91
+
92
def main(args):
    """
    Run the contextual-understanding evaluation end to end and print the average score.

    Steps: load predictions (one record per line), give every record a unique
    ``<video_name>_<n>`` key, annotate all records that have no result file in
    ``args.output_dir`` yet, merge the per-sample files into
    ``args.output_json`` and print the mean score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json`` and
            ``num_tasks`` attributes.
    """
    # One record per line. Parse as JSON first and fall back to a Python
    # literal; file content is never passed to eval(). Blank lines are skipped.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Suffix every video name with its occurrence index so keys are unique.
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of expected result files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive split count would make the partitioning below fail forever.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been written.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was evaluated; avoid dividing by zero.
        print("No evaluated samples found; average score is undefined.")
        return
    average_score = sum(scores) / len(scores)

    print("Average score for contextual understanding:", average_score)
192
+
193
+
194
if __name__ == "__main__":
    # Command-line entry point: parse options, export the Azure settings, run the evaluation.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument(
        "--pred-path", required=True,
        help="The path to file containing prediction.",
    )
    parser.add_argument(
        "--output-dir", required=True,
        help="The path to save annotation json files.",
    )
    parser.add_argument(
        "--output-json", required=True,
        help="The path to save annotation final combined json file.",
    )
    parser.add_argument(
        "--num-tasks", required=True, type=int,
        help="Number of splits.",
    )
    parser.add_argument(
        "--api-key", required=True, type=str,
        help="Azure Openai API key.",
    )
    parser.add_argument(
        "--api-endpoint", required=True, type=str,
        help="Azure Openai API endpoint.",
    )
    parser.add_argument(
        "--api-deployname", required=True, type=str,
        help="Azure Openai API deployname.",
    )
    args = parser.parse_args()

    # init() and interaction() read their configuration from these variables.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared with annotate().
    client = init()

    main(args)
videollama2/eval/eval_benchmark_4_temporal.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """
    Build an Azure OpenAI client from environment variables.

    Reads ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_KEY`` and pins the API
    version to ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """
    Send one chat-completion request and return the raw completion object.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Value passed through as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is read from the environment on every call.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """
    Score predicted answers for temporal understanding with the judge model.

    For every name in ``caption_files`` the matching entry of ``prediction_set``
    is sent to the judge via ``interaction`` and the parsed reply is written to
    ``<output_dir>/<key>.json`` as ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with keys ``'q'``,
            ``'a'`` and ``'p'``.
        caption_files: Iterable of ``"<key>.json"`` file names to process.
        output_dir: Directory that receives one JSON file per sample.
        args: Parsed command-line arguments; unused here, kept for the caller.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.

    A failure while querying or parsing a single sample is printed and the
    sample is skipped; no output file is written for it.
    """
    # Workers started with the "spawn"/"forkserver" methods re-import this module
    # without running the __main__ guard, so the global `client` may not exist
    # there. Fall back to building one from the environment.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # NOTE(review): the example reply at the end of the user prompt
            # ({''score': 4.8}) is not a valid Python literal and is not an
            # integer. It is kept verbatim so scores stay comparable.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
                        "- Evaluate the temporal accuracy of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            # Reject replies that would crash the later averaging step; with no
            # file written, the sample stays "incomplete" for the caller.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected judge response: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            # Best effort per sample: report and continue with the next one.
            print(f"Error processing file '{key}': {e}")
84
+
85
+
86
def main(args):
    """
    Run the temporal-understanding evaluation end to end and print the average score.

    Steps: load predictions (one record per line), give every record a unique
    ``<video_name>_<n>`` key, annotate all records that have no result file in
    ``args.output_dir`` yet, merge the per-sample files into
    ``args.output_json`` and print the mean score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json`` and
            ``num_tasks`` attributes.
    """
    # One record per line. Parse as JSON first and fall back to a Python
    # literal; file content is never passed to eval(). Blank lines are skipped.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Suffix every video name with its occurrence index so keys are unique.
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of expected result files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    # A non-positive split count would make the partitioning below fail forever.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been written.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was evaluated; avoid dividing by zero.
        print("No evaluated samples found; average score is undefined.")
        return
    average_score = sum(scores) / len(scores)

    print("Average score temporal understanding:", average_score)
186
+
187
+
188
if __name__ == "__main__":
    # Command-line entry point: parse options, export the Azure settings, run the evaluation.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument(
        "--pred-path", required=True,
        help="The path to file containing prediction.",
    )
    parser.add_argument(
        "--output-dir", required=True,
        help="The path to save annotation json files.",
    )
    parser.add_argument(
        "--output-json", required=True,
        help="The path to save annotation final combined json file.",
    )
    parser.add_argument(
        "--num-tasks", required=True, type=int,
        help="Number of splits.",
    )
    parser.add_argument(
        "--api-key", required=True, type=str,
        help="Azure Openai API key.",
    )
    parser.add_argument(
        "--api-endpoint", required=True, type=str,
        help="Azure Openai API endpoint.",
    )
    parser.add_argument(
        "--api-deployname", required=True, type=str,
        help="Azure Openai API deployname.",
    )
    args = parser.parse_args()

    # init() and interaction() read their configuration from these variables.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared with annotate().
    client = init()

    main(args)
videollama2/eval/eval_benchmark_5_consistency.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """
    Creates the Azure OpenAI client used by this script.

    The endpoint and key are read from the AZURE_OPENAI_ENDPOINT and
    AZURE_OPENAI_KEY environment variables; the API version is pinned to
    "2024-02-15-preview".
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """
    Sends one chat-completion request and returns the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: The chat messages, passed through as ``messages``.

    The deployment name is taken from the AZURE_OPENAI_DEPLOYNAME
    environment variable; all sampling parameters are fixed.
    """
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """
    Evaluates question and answer pairs using the Azure OpenAI chat model and
    writes a consistency score for each sample.

    Args:
        prediction_set: Mapping from sample key to a dict with the entries
            'q1', 'q2', 'a', 'p1' and 'p2'.
        caption_files: Result file names ("<key>.json") still to be produced.
        output_dir: Directory that receives one "<key>.json" file per sample,
            holding ``[response_dict, qa_set]``.
        args: Parsed command-line arguments; not used by this function.

    A failure on a single sample is printed and that sample is skipped, so no
    result file is written for it.
    """
    # The module-level `client` is bound only under the `__main__` guard, so
    # it does not exist in worker processes that re-import this module
    # (spawn / forkserver start methods). Reuse it when present, otherwise
    # build a client from the environment variables inherited by the worker.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension
        qa_set = prediction_set[key]
        question1 = qa_set['q1']
        question2 = qa_set['q2']
        answer = qa_set['a']
        pred1 = qa_set['p1']
        pred2 = qa_set['p2']
        try:
            # NOTE(review): the example at the end of the user prompt
            # ({''score': 4.8}) is neither a valid dict literal nor an
            # integer score. It is kept verbatim so scores stay comparable
            # with earlier runs of this benchmark.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
                        "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ."
                        "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
                        "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
                        "- Evaluate the consistency of the two predicted answers compared to the correct answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question 1: {question1}\n"
                        f"Question 2: {question2}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer to Question 1: {pred1}\n"
                        f"Predicted Answer to Question 2: {pred2}\n\n"
                        "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            # Best effort: the caller detects the missing file and retries.
            print(f"Error processing file '{key}': {e}")
94
+
95
+
96
def _load_predictions(pred_path):
    """Parse a predictions file that holds one record per line.

    Each non-empty line is decoded as JSON first and, failing that, as a
    Python literal. Unlike ``eval`` this never executes code from the file.
    """
    samples = []
    with open(pred_path, 'r') as pred_file:
        for raw_line in pred_file:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                samples.append(json.loads(raw_line))
            except json.JSONDecodeError:
                samples.append(ast.literal_eval(raw_line))
    return samples


def main(args):
    """
    Runs the consistency evaluation end to end.

    Reads the predictions from ``args.pred_path``, scores every sample through
    ``annotate`` in a process pool (one result file per sample in
    ``args.output_dir``), merges all result files into ``args.output_json``
    and prints the average score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json``
            and ``num_tasks``.
    """
    pred_contents = _load_predictions(args.pred_path)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # A video may appear several times; suffix its name with a running index
    # (starting at 0) so every sample gets a unique key.
    for sample in pred_contents:
        video_id = sample['video_name']
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        # The sample dict is renamed in place.
        sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(sample)

    # Generating list of id's and corresponding files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {
            "q1": sample['Q1'],
            "q2": sample['Q2'],
            "a": sample['A'],
            "p1": sample['P1'],
            "p2": sample['P2'],
        }
        for sample in new_pred_contents
    }

    # A value below 1 would make the split below divide by zero on every
    # iteration and never terminate.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    # NOTE: a sample that fails on every attempt keeps this loop running.
    while True:
        try:
            # Files that have already been processed.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    if not combined_contents:
        print("No evaluation results found; average score is undefined.")
        return

    score_sum = 0
    count = 0
    for result in combined_contents.values():
        count += 1
        # Each result is [response_dict, qa_set]; the score is truncated to int.
        score_sum += int(result[0]['score'])
    average_score = score_sum / count

    print("Average score for consistency:", average_score)
198
+
199
+
200
if __name__ == "__main__":
    # Command-line entry point of the consistency evaluation. Every option is
    # mandatory; each row is (flag, extra add_argument keywords, help text).
    cli_options = [
        ("--pred-path", {}, "The path to file containing prediction."),
        ("--output-dir", {}, "The path to save annotation json files."),
        ("--output-json", {}, "The path to save annotation final combined json file."),
        ("--num-tasks", {"type": int}, "Number of splits."),
        ("--api-key", {"type": str}, "Azure Openai API key."),
        ("--api-endpoint", {"type": str}, "Azure Openai API endpoint."),
        ("--api-deployname", {"type": str}, "Azure Openai API deployname."),
    ]
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, extra, help_text in cli_options:
        parser.add_argument(flag, required=True, help=help_text, **extra)
    args = parser.parse_args()

    # init() and interaction() read their settings from these environment
    # variables, so they are exported before the client is created.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client; annotate() looks it up as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_qa_gpt.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ast
3
+ import json
4
+ import time
5
+ import argparse
6
+ import traceback
7
+ from tqdm import tqdm
8
+ from multiprocessing.pool import Pool
9
+
10
+ from openai import AzureOpenAI
11
+
12
+
13
def init():
    """
    Returns an AzureOpenAI client configured from the environment.

    AZURE_OPENAI_ENDPOINT supplies the endpoint and AZURE_OPENAI_KEY the API
    key; the API version is fixed to "2024-02-15-preview".
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
21
+
22
+
23
def interaction(client, message_text):
    """
    Issues a single chat-completion call with fixed sampling settings.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages forwarded as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    deployment = os.getenv("AZURE_OPENAI_DEPLOYNAME")
    create = client.chat.completions.create
    return create(
        model=deployment,
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
36
+
37
+
38
def prompt_gpt(question, answer, pred, key, qa_set, output_dir):
    """
    Asks the chat model to grade one prediction and stores the verdict.

    Args:
        question: The question text.
        answer: The reference answer.
        pred: The predicted answer to grade.
        key: Sample id; the result is written to "<output_dir>/<key>.json".
        qa_set: The sample's question/answer/prediction dict, stored next to
            the model's verdict.
        output_dir: Directory receiving the result file.

    Raises:
        Any exception from the API call, from parsing the reply with
        ``ast.literal_eval`` or from writing the file; the caller handles it.
    """
    # The module-level `client` is bound only under the `__main__` guard and
    # is therefore missing in worker processes that re-import this module
    # (spawn / forkserver start methods). Create it on demand and cache it so
    # later calls in the same process reuse it.
    api_client = globals().get("client")
    if api_client is None:
        api_client = globals()["client"] = init()

    # NOTE(review): the example score in the user prompt (4.8) contradicts
    # the integer requirement; kept verbatim so scores stay comparable.
    message = [
        {
            "role": "system",
            "content":
                "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
                "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
                "------"
                "##INSTRUCTIONS: "
                "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
                "- Consider synonyms or paraphrases as valid matches.\n"
                "- Evaluate the correctness of the prediction compared to the answer."
        },
        {
            "role": "user",
            "content":
                "Please evaluate the following video-based question-answer pair:\n\n"
                f"Question: {question}\n"
                f"Correct Answer: {answer}\n"
                f"Predicted Answer: {pred}\n\n"
                "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
                "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
                "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
        }
    ]
    completion = interaction(api_client, message)
    # Convert response to a Python dictionary.
    response_message = completion.choices[0].message.content
    response_dict = ast.literal_eval(response_message)
    result_qa_pair = [response_dict, qa_set]
    # Save the question-answer pairs to a json file.
    with open(f"{output_dir}/{key}.json", "w") as f:
        json.dump(result_qa_pair, f)
72
+
73
+
74
def annotate(prediction_set, caption_files, output_dir, args):
    """
    Evaluates question and answer pairs using the chat model and stores a
    correctness verdict for every sample.

    Args:
        prediction_set: Mapping from sample id to a dict with the entries
            'q', 'a' and 'p'.
        caption_files: Result file names ("<id>.json") still to be produced.
        output_dir: Directory receiving the result files.
        args: Parsed command-line arguments; not used by this function.

    Each sample is tried once as-is and, on failure, once more with the
    prediction cut to its first 50 items. A sample that fails both attempts
    is left without a result file instead of aborting the remaining samples.
    """
    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            prompt_gpt(question, answer, pred, key, qa_set, output_dir)
        except Exception:
            traceback.print_exc()
            try:
                # Retry with a truncated prediction.
                prompt_gpt(question, answer, pred[:50], key, qa_set, output_dir)
            except Exception:
                # Keep going; the missing result file marks the sample as
                # incomplete for the caller.
                traceback.print_exc()

    # NOTE(review): the indentation of this pause could not be recovered from
    # the flattened source; it is placed after the loop. Confirm.
    time.sleep(1)
93
+
94
+
95
def _load_predictions(pred_path):
    """Parse a predictions file that holds one record per line.

    Each non-empty line is decoded as JSON first and, failing that, as a
    Python literal. Unlike ``eval`` this never executes code from the file.
    """
    samples = []
    with open(pred_path) as pred_file:
        for raw_line in pred_file:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                samples.append(json.loads(raw_line))
            except json.JSONDecodeError:
                samples.append(ast.literal_eval(raw_line))
    return samples


def main(args):
    """
    Runs the correctness evaluation end to end.

    Reads the predictions from ``args.pred_path``, grades every sample through
    ``annotate`` in a process pool (one result file per sample in
    ``args.output_dir``), merges the result files into ``args.output_json``
    and prints the yes/no counts, accuracy and average score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json``
            and ``num_tasks``.

    Raises:
        SystemExit: With a non-zero status when a result file cannot be
            parsed.
    """
    new_pred_contents = _load_predictions(args.pred_path)

    # Generating list of id's and corresponding files
    caption_files = [f"{sample['id']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['id']: {"q": sample['question'], "a": sample['answer'], "p": sample['pred']}
        for sample in new_pred_contents
    }

    # A value below 1 would make the split below divide by zero on every
    # iteration and never terminate.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    # NOTE: a sample that fails on every attempt keeps this loop running.
    while True:
        try:
            # Files that have already been processed.
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                try:
                    content = json.load(json_file)
                except ValueError as err:
                    # Covers malformed JSON and undecodable bytes. Stop with
                    # a failing exit status and name the offending file.
                    raise SystemExit(f"Could not parse result file '{file_path}': {err}") from err
            combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score and accuracy
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for result in tqdm(combined_contents.values()):
        try:
            # Computing score. The sample is counted before its fields are
            # read, so a malformed result still enters the average (as 0).
            count += 1
            score_sum += int(result[0]['score'])

            # Computing accuracy
            verdict = result[0]['pred'].lower()
            if "yes" in verdict:
                yes_count += 1
            elif "no" in verdict:
                no_count += 1
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            print(result)

    judged = yes_count + no_count
    print("Yes count:", yes_count)
    print("No count:", no_count)
    if count == 0 or judged == 0:
        print("Not enough valid results to compute accuracy and average score.")
        return
    print("Accuracy:", yes_count / judged)
    print("Average score:", score_sum / count)
199
+
200
+
201
if __name__ == "__main__":
    # Command-line entry point of the GPT-based video QA evaluation. Every
    # option is mandatory; each row is (flag, extra add_argument keywords,
    # help text).
    cli_options = [
        ("--pred-path", {}, "The path to file containing prediction."),
        ("--output-dir", {}, "The path to save annotation json files."),
        ("--output-json", {}, "The path to save annotation final combined json file."),
        ("--num-tasks", {"type": int}, "Number of splits."),
        ("--api-key", {"type": str}, "Azure Openai API key."),
        ("--api-endpoint", {"type": str}, "Azure Openai API endpoint."),
        ("--api-deployname", {"type": str}, "Azure Openai API deployname."),
    ]
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    for flag, extra, help_text in cli_options:
        parser.add_argument(flag, required=True, help=help_text, **extra)
    args = parser.parse_args()

    # init() and interaction() read their settings from these environment
    # variables, so they are exported before the client is created.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client; prompt_gpt() looks it up as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_qa_mvbench.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from tabulate import tabulate
4
+
5
+
6
# MVBench task table: task name -> (annotation file, video sub-directory,
# data type, has start/end bound). The evaluation in this file only uses the
# task names; the tuple layout mirrors the table used by the inference script.
tasks = {
    "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True),  # has start & end
    "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True),  # has start & end
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    # Directory name fixed: it previously read "pMoments_in_Time_Raw/videos/".
    "Fine-grained Action": ("fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True),  # has start & end
    "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True),  # has start & end
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", "perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", "perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True),  # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
}
28
+
29
+
30
def main():
    """Print overall and per-task MVBench accuracy for a predictions file.

    The file named by ``--pred_path`` holds one JSON object per line with the
    keys 'pred', 'gt' and 'task_type'. Accuracies are percentages; a task
    without any prediction is reported as ``nan``.
    """
    args = parse_args()

    # Records are JSON lines, so they are parsed with json.loads rather than
    # evaluated as Python code. Blank lines are ignored.
    with open(args.pred_path, 'r') as pred_file:
        res = [json.loads(line) for line in pred_file if line.strip()]

    if not res:
        print(f"{args.pred_path}: no predictions found")
        return

    task_names = list(tasks.keys())
    task_hits = {name: [] for name in task_names}
    hits = []
    for record in res:
        value = 1 if record['pred'] == record['gt'] else 0
        hits.append(value)
        task_hits[record['task_type']].append(value)

    acc = sum(hits) * 100 / len(hits)
    task_acc = {
        name: sum(values) * 100 / len(values) if values else float("nan")
        for name, values in task_hits.items()
    }
    print(f"{args.pred_path}:", acc)

    # Lay the tasks out four per row: a row of names, then a row of scores.
    table_data = []
    for start in range(0, len(task_names), 4):
        row_task_names = task_names[start:start + 4]
        table_data.append(row_task_names)
        table_data.append([task_acc[name] for name in row_task_names])
    print(tabulate(table_data, floatfmt=".1f"), '\n')
54
+
55
+
56
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``--pred_path``, which defaults to an empty string.
    """
    cli = argparse.ArgumentParser(description="Evaluate video captioning.")
    cli.add_argument("--pred_path", default=r'', help="The path to file containing prediction.")
    return cli.parse_args()


# Run the evaluation only when executed as a script.
if __name__ == '__main__':
    main()
videollama2/eval/run_inference_video_qa_batch.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+
8
+ import torch
9
+ import decord
10
+ import numpy as np
11
+ import transformers
12
+ from PIL import Image
13
+ from tqdm import tqdm
14
+ from decord import VideoReader, cpu
15
+ from torch.utils.data import Dataset, DataLoader
16
+ from torchvision import transforms as T
17
+ from torchvision.transforms import functional as F
18
+
19
+ import sys
20
+ sys.path.append('./')
21
+ from videollama2.conversation import conv_templates, SeparatorStyle
22
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
23
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_videos, expand2square
24
+ from videollama2.model.builder import load_pretrained_model
25
+
26
+
27
# NOTE: Silence the "TypedStorage is deprecated" UserWarning, see
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    message='TypedStorage is deprecated',
)

# Placeholder, start and end tokens plus the token index of the VIDEO
# modality, looked up once from the tables in videollama2.constants.
default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
34
+
35
+
36
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks are
    returned when the list does not fill them all. An empty list yields an
    empty list of chunks.
    """
    if not lst:
        # ceil(0 / n) would give a step of 0, which range() rejects.
        return []
    chunk_size = math.ceil(len(lst) / n)  # round up so no element is dropped
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
40
+
41
+
42
def get_chunk(lst, n, k):
    """Return the k-th of the n chunks produced by ``split_list``.

    ``split_list`` may produce fewer than ``n`` chunks (5 items split 4 ways
    gives 3 chunks). A valid index ``k < n`` that falls past the last chunk
    returns an empty list instead of raising.

    Raises:
        IndexError: If ``k`` is not smaller than ``n``.
    """
    if k >= n:
        raise IndexError(f"chunk index {k} out of range for {n} chunks")
    chunks = split_list(lst, n)
    return chunks[k] if k < len(chunks) else []
45
+
46
+
47
class MVBenchDataset(Dataset):
    """Dataset over MVBench samples that yields processed frames and a
    multiple-choice prompt.

    Every entry of ``data_list`` is a dict with the keys 'task_type',
    'prefix', 'data_type' ('video', 'gif' or 'frame'), 'bound' and 'data';
    'data' carries 'video', 'question', 'candidates', 'answer' and, for
    bounded tasks, 'start' and 'end'.
    """

    def __init__(self, data_list, processor, num_segments=8):
        self.data_list = data_list

        # Reader used for each 'data_type' value.
        self.decord_method = {
            'video': self.read_video,
            'gif': self.read_gif,
            'frame': self.read_frame,
        }

        self.processor = processor
        self.num_segments = num_segments

    def __str__(self):
        """Summarise sample and option counts per task type."""
        len_list = {}
        option_list = {}
        for data in self.data_list:
            task_type = data['task_type']
            len_list[task_type] = len_list.get(task_type, 0) + 1
            option_list[task_type] = option_list.get(task_type, 0) + len(data['data']['candidates'])

        correct = 0
        total = 0
        res = f"There are {len(self.data_list)} videos as follow:\n"
        for k, v in len_list.items():
            n_options = option_list[k]
            correct += v
            total += n_options
            # Guard against tasks whose samples list no candidates.
            chance = v / n_options * 100 if n_options else 0.0
            res += f"{v} for {k} ({n_options} options => {chance:.2f}%)\n"
            # NOTE(review): this extra 1/options term is kept from the
            # original arithmetic; its intent is unclear. Confirm.
            if n_options:
                correct = correct + 1 / n_options
        # An empty dataset has no options at all; report 0 instead of
        # dividing by zero.
        overall = correct / total * 100 if total else 0.0
        res += f"Total random accuracy: {overall:.2f}%"
        return res.rstrip()

    def __len__(self):
        return len(self.data_list)

    def get_index(self, bound, fps, max_frame, first_idx=0):
        """Pick ``num_segments`` frame indices, one from the middle of each
        equal-length segment of the clip.

        Args:
            bound: ``(start, end)`` in seconds, or a falsy value for the
                whole clip.
            fps: Frame rate used to convert seconds to frame indices.
            max_frame: Largest allowed frame index.
            first_idx: Smallest allowed frame index.
        """
        if bound:
            start, end = bound[0], bound[1]
        else:
            # Wide sentinels; clamped to the valid frame range just below.
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])
        return frame_indices

    def read_video(self, video_path, bound=None):
        """Decode the sampled frames of a video file and run the processor."""
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())

        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        images_group = [Image.fromarray(vr[frame_index].asnumpy()) for frame_index in frame_indices]
        # Frames are passed on as decoded; the expand2square padding is not applied.
        torch_imgs = self.processor(images_group, return_tensors='pt')['pixel_values']
        return torch_imgs

    def read_gif(self, video_path, bound=None, fps=25):
        """Decode the sampled frames of a GIF and run the processor."""
        # This file does not import imageio or cv2 at module level, so they
        # are imported here to keep this reader usable.
        import cv2
        import imageio

        gif = imageio.get_reader(video_path)
        max_frame = len(gif) - 1

        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        for index, frame in enumerate(gif):
            if index in frame_indices:
                img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
                img = Image.fromarray(img)
                images_group.append(img)
        torch_imgs = self.processor(images_group, return_tensors='pt')['pixel_values']
        return torch_imgs

    def read_frame(self, video_path, bound=None, fps=3):
        """Load the sampled frames from a directory of "%05d.jpg" images and
        run the processor."""
        max_frame = len(os.listdir(video_path))
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=1)  # frame_idx starts from 1
        images_group = [
            Image.open(os.path.join(video_path, f"{frame_index:05d}.jpg"))
            for frame_index in frame_indices
        ]
        torch_imgs = self.processor.preprocess(images_group, return_tensors='pt')['pixel_values']
        return torch_imgs

    def qa_template(self, data):
        """Format a sample as a lettered multiple-choice question.

        Returns:
            ``(question, answer)`` where the answer is prefixed with the
            letter of the matching candidate.
        """
        question = f"Question: {data['question']}\n"
        question += "Options:\n"
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(data['candidates']):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def __getitem__(self, idx):
        """Return frames, prompt and ground truth for sample ``idx``."""
        sample = self.data_list[idx]
        record = sample['data']
        decord_method = self.decord_method[sample['data_type']]
        bound = None
        if sample['bound']:
            bound = (record['start'], record['end'])
        video_path = os.path.join(sample['prefix'], record['video'])
        torch_imgs = decord_method(video_path, bound)
        question = record['question']
        options = record['candidates']
        answer = record['answer']
        task_type = sample['task_type']

        # Letter the options A, B, C, ... and remember which one is correct
        # (-1 when the answer is not among the candidates).
        answer_idx = -1
        letters = []
        options_string = ''
        for option_idx, c in enumerate(options):
            letter = chr(ord('A') + option_idx)
            letters.append(letter)
            options_string += f"({letter}) {c}\n"
            if c == answer:
                answer_idx = option_idx

        option_question = f"Question: {question}\nOptions:\n{options_string}Answer with the option's letter from the given choices directly and only give the best option."

        return {
            'video': torch_imgs,
            'video_path': video_path,
            'question': option_question,
            'letters': ','.join(letters),
            'answer_idx': answer_idx,
            'task_type': task_type
        }
189
+
190
+
191
# Video sub-directories shared by several MVBench tasks.
_CHARADES_DIR = "star/Charades_v1_480/"
_CLEVRER_DIR = "clevrer/video_validation/"
_PERCEPTION_DIR = "perception/videos/"

# MVBench task table: task name -> (annotation file, video sub-directory,
# data type, has start/end bound).
tasks = {
    "Action Sequence": ("action_sequence.json", _CHARADES_DIR, "video", True),  # has start & end
    "Action Prediction": ("action_prediction.json", _CHARADES_DIR, "video", True),  # has start & end
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    "Fine-grained Action": ("fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", _CLEVRER_DIR, "video", False),
    "Object Interaction": ("object_interaction.json", _CHARADES_DIR, "video", True),  # has start & end
    "Object Shuffle": ("object_shuffle.json", _PERCEPTION_DIR, "video", False),
    "Moving Direction": ("moving_direction.json", _CLEVRER_DIR, "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True),  # has start & end
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", _PERCEPTION_DIR, "video", False),
    "Moving Count": ("moving_count.json", _CLEVRER_DIR, "video", False),
    "Moving Attribute": ("moving_attribute.json", _CLEVRER_DIR, "video", False),
    "State Change": ("state_change.json", _PERCEPTION_DIR, "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", _PERCEPTION_DIR, "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True),  # has start & end, read frame
    "Counterfactual Inference": ("counterfactual_inference.json", _CLEVRER_DIR, "video", False),
}
213
+
214
+
215
def build_mvbench_eval(args, processor, num_frames):
    """Build the DataLoader for this worker's share of MVBench.

    Args:
        args: Namespace providing ``question_file`` (directory holding the
            per-task annotation files), ``video_folder``, ``num_chunks``,
            ``chunk_idx``, ``batch_size`` and ``num_workers``.
        processor: Image processor handed to ``MVBenchDataset``.
        num_frames: Number of frames sampled per video.

    Returns:
        A non-shuffling DataLoader over the selected chunk of samples.
    """
    samples = []
    for task_name, (json_name, video_subdir, data_type, has_bound) in tasks.items():
        annotation_path = os.path.join(args.question_file, json_name)
        vis_folder = os.path.join(args.video_folder, video_subdir)
        with open(annotation_path, 'r') as f:
            annotations = json.load(f)
        samples.extend(
            {
                'task_type': task_name,
                'prefix': vis_folder,
                'data_type': data_type,
                'bound': has_bound,
                'data': data,
            }
            for data in annotations
        )

    # Keep only the chunk assigned to this worker.
    shard = get_chunk(samples, args.num_chunks, args.chunk_idx)
    dataset = MVBenchDataset(shard, processor, num_segments=num_frames)
    return DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
235
+
236
+
237
def mvbench_dump(ans_file, line, outputs):
    """Write one JSON line per MVBench prediction in the batch.

    Args:
        ans_file: Writable text file object receiving the JSON lines.
        line: Collated batch; reads ``video_path``, ``task_type``, ``letters``
            (comma-joined option letters) and ``answer_idx`` (entries must
            support ``.item()``).
        outputs: Decoded model responses, aligned with the batch entries.
    """
    for idx, output in enumerate(outputs):
        vid = line['video_path'][idx]
        task_type = line['task_type'][idx]
        letters = line['letters'][idx].split(',')
        answer_idx = line['answer_idx'][idx].item()

        # Raw f-string: "\(" and "\ " are regex escapes, not Python string escapes.
        matches = re.findall(rf'[\(,\ ]*[{letters[0]}-{letters[-1]}][\),\ ]*', output)
        if not matches:
            # No option letter in the response: record an index that differs
            # from the ground truth (whenever there is more than one option).
            pred_idx = (answer_idx + 1) % len(letters)
        else:
            # The pattern can capture surrounding parentheses, commas and
            # spaces; strip all of them so e.g. "A, " maps to "A" instead of
            # making letters.index() raise ValueError.
            pred_idx = letters.index(matches[0].strip('(), '))

        ans_file.write(json.dumps({"vid": vid, "task_type": task_type, "pred": pred_idx, "gt": answer_idx}) + '\n')
254
+
255
+
256
class NextoeDataset(Dataset):
    """Open-ended NExT-QA samples: decodes a video and wraps its question in a prompt."""

    # Extensions tried, in order, when resolving a video name to a file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8, video_folder=None):
        """Store the samples and decoding settings.

        Args:
            data_list: Sequence of dicts with 'video', 'question', 'answer', 'qid'.
            processor: Object whose ``preprocess(frames, return_tensors='pt')``
                returns a mapping containing 'pixel_values'.
            num_segments: Number of frames sampled uniformly from each video.
            video_folder: Directory holding the videos. When None, the
                module-level ``args.video_folder`` is used, as before.
        """
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no supported extension matches.
        """
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        video_name = line['video']
        question = line['question']
        answer = line['answer']

        video_path = self._find_video(video_name)

        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        # Sample self.num_segments evenly spaced frames (previously hard-coded to 8,
        # which ignored the value passed to the constructor).
        frame_ids = np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)
        frames = decord_vr.get_batch(frame_ids).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        wrapped_question = f'Question: {question}\nAnswer the question using a single word or a short phrase with multiple words.'

        return {
            'video': video_tensor,
            'question': wrapped_question,
            'answer': answer,
            'qid': line['qid']
        }
292
+
293
+
294
def build_nextoe_eval(args, processor, num_frames):
    """Build the evaluation DataLoader for open-ended NExT-QA.

    Args:
        args: Parsed CLI arguments (reads question_file, num_chunks, chunk_idx,
            batch_size, num_workers).
        processor: Frame processor handed to the dataset.
        num_frames: Number of segments passed to the dataset.

    Returns:
        A non-shuffling ``DataLoader`` over this worker's chunk of questions.
    """
    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = NextoeDataset(questions, processor, num_segments=num_frames)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
301
+
302
+
303
def nextoe_dump(ans_file, line, outputs):
    """Append one JSON line per open-ended prediction.

    Each ``line['qid']`` entry is expected to be ``"<vid>_<qid>"``; the two
    parts are written separately next to the model's raw answer.
    """
    for position, prediction in enumerate(outputs):
        vid, qid = line['qid'][position].split('_')
        record = {"vid": vid, "qid": qid, "prediction": prediction}
        ans_file.write(json.dumps(record) + '\n')
307
+
308
+
309
class NextqaDataset(Dataset):
    """Multiple-choice NExT-QA samples: decodes a video and builds a five-option prompt."""

    # Extensions tried, in order, when resolving a video name to a file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8, video_folder=None):
        """Store the samples and decoding settings.

        Args:
            data_list: Sequence of dicts with 'video', 'question', 'answer',
                'qid', 'num_option' and options 'a0'..'a4'.
            processor: Object whose ``preprocess(frames, return_tensors='pt')``
                returns a mapping containing 'pixel_values'.
            num_segments: Number of frames sampled uniformly from each video.
            video_folder: Directory holding the videos. When None, the
                module-level ``args.video_folder`` is used, as before.
        """
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no supported extension matches.
        """
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        video_name = line['video']
        question = line['question']
        answer = line['answer']

        video_path = self._find_video(video_name)

        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        # Sample self.num_segments evenly spaced frames (previously hard-coded to 8,
        # which ignored the value passed to the constructor).
        frame_ids = np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)
        frames = decord_vr.get_batch(frame_ids).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        # The prompt below is hard-wired to options (A)-(E).
        assert line['num_option'] == 5
        a0 = line['a0']
        a1 = line['a1']
        a2 = line['a2']
        a3 = line['a3']
        a4 = line['a4']

        option_question = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'video': video_tensor,
            'question': option_question,
            'answer': answer,
            'qid': line['qid']
        }
352
+
353
+
354
def build_nextqa_eval(args, processor, num_frames):
    """Build the evaluation DataLoader for multiple-choice NExT-QA.

    Args:
        args: Parsed CLI arguments (reads question_file, num_chunks, chunk_idx,
            batch_size, num_workers).
        processor: Frame processor handed to the dataset.
        num_frames: Number of segments passed to the dataset.

    Returns:
        A non-shuffling ``DataLoader`` over this worker's chunk of questions.
    """
    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = NextqaDataset(questions, processor, num_segments=num_frames)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
361
+
362
+
363
def nextqa_dump(ans_file, line, outputs):
    """Write one JSON line per NExT-QA multiple-choice prediction.

    Args:
        ans_file: Writable text file object receiving the JSON lines.
        line: Collated batch; reads ``qid`` and ``answer`` (entries of
            ``answer`` must support ``.item()``).
        outputs: Decoded model responses, aligned with the batch entries.
    """
    letters = ['A', 'B', 'C', 'D', 'E']

    for idx, output in enumerate(outputs):
        qid = line['qid'][idx]
        answer = line['answer'][idx].item()

        # Raw string: "\(" and "\ " are regex escapes, not Python string escapes.
        matches = re.findall(r'[\(,\ ]*[A-E][\),\ ]*', output)
        if not matches:
            # No option letter in the response: fall back to index 2 ("C").
            pred_idx = 2
        else:
            # The pattern can capture surrounding parentheses, commas and
            # spaces; strip all of them so e.g. "E, " maps to "E" instead of
            # making letters.index() raise ValueError.
            pred_idx = letters.index(matches[0].strip('(), '))

        ans_file.write(json.dumps({"id": qid, "prediction": pred_idx, "answer": answer}) + '\n')
380
+
381
+
382
class EgoschemaDataset(Dataset):
    """EgoSchema samples: decodes the clip named by ``q_uid`` and builds a five-option prompt."""

    # Extensions tried, in order, when resolving a clip id to a file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8, video_folder=None):
        """Store the samples and decoding settings.

        Args:
            data_list: Sequence of dicts with 'q_uid', 'question' and
                'option 0'..'option 4'.
            processor: Object whose ``preprocess(frames, return_tensors='pt')``
                returns a mapping containing 'pixel_values'.
            num_segments: Number of frames sampled uniformly from each video.
            video_folder: Directory holding the videos. When None, the
                module-level ``args.video_folder`` is used, as before.
        """
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, q_uid):
        """Return the first existing ``<folder>/<q_uid><ext>`` path.

        Raises:
            FileNotFoundError: If no supported extension matches.
        """
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{q_uid}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{q_uid}' with extensions {self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        q_uid = line['q_uid']

        video_path = self._find_video(q_uid)

        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        frame_ids = np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)
        frames = decord_vr.get_batch(frame_ids).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        question = line['question']
        a0 = line['option 0']
        a1 = line['option 1']
        a2 = line['option 2']
        a3 = line['option 3']
        a4 = line['option 4']

        # NOTE(review): the "\n." before "Answer" looks like a typo, but it is part of
        # the prompt sent to the model, so it is kept byte-for-byte.
        option_question = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\n.Answer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'q_uid': q_uid,
            'video': video_tensor,
            'question': option_question,
        }
424
+
425
+
426
def build_egoschema_eval(args, processor, num_frames):
    """Build the evaluation DataLoader for EgoSchema.

    Args:
        args: Parsed CLI arguments (reads question_file, num_chunks, chunk_idx,
            batch_size, num_workers).
        processor: Frame processor handed to the dataset.
        num_frames: Number of segments passed to the dataset.

    Returns:
        A non-shuffling ``DataLoader`` over this worker's chunk of questions.
    """
    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = EgoschemaDataset(questions, processor, num_segments=num_frames)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
433
+
434
+
435
def egoschema_dump(ans_file, line, outputs):
    """Write one ``"<q_uid>, <pred_idx>"`` line per EgoSchema prediction.

    Args:
        ans_file: Writable text file object.
        line: Collated batch; reads ``q_uid``.
        outputs: Decoded model responses, aligned with the batch entries.
    """
    letters = ['A', 'B', 'C', 'D', 'E']

    for idx, output in enumerate(outputs):
        q_uid = line['q_uid'][idx]

        # Raw string: "\(" and "\ " are regex escapes, not Python string escapes.
        matches = re.findall(r'[\(\ ]*[A-E][\)\ ]*', output)
        if not matches:
            # No option letter in the response: fall back to index 2 ("C").
            pred_idx = 2
        else:
            # The pattern can capture parentheses and spaces on both sides of the
            # letter; strip them together so "( D )" maps to "D" instead of
            # making letters.index() raise ValueError.
            pred_idx = letters.index(matches[0].strip('() '))
        ans_file.write(f'{q_uid}, {pred_idx}\n')
449
+
450
+
451
def get_model_output(model, video_tensor, tokenizer, questions, conv_mode="v1", device='cuda'):
    """Generate one decoded answer per question for a batch of videos.

    Args:
        model: Model exposing ``config.mm_use_im_start_end`` and ``generate``.
        video_tensor: Batched video tensor; cast to half precision and moved
            to ``device``.
        tokenizer: Tokenizer providing ``pad_token_id``, ``eos_token_id`` and
            ``batch_decode``.
        questions: Iterable of question strings, one per video.
        conv_mode: Key into ``conv_templates`` selecting the prompt template.
        device: Device on which the inputs are placed.

    Returns:
        List of decoded strings from ``tokenizer.batch_decode``.
    """
    input_ids = []
    modal_list = []
    for qs in questions:
        # Prepend the video placeholder (optionally wrapped in start/end tokens).
        if model.config.mm_use_im_start_end:
            qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
        else:
            qs = default_mm_token + "\n" + qs

        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_id = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt')
        input_ids.append(input_id)
        modal_list.append("video")

    # Left-pad: reverse each sequence, right-pad the batch, then reverse back.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [x.flip(dims=[0]) for x in input_ids],
        batch_first=True,
        padding_value=tokenizer.pad_token_id).flip(dims=[1]).to(device)

    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(device)

    # Honour the `device` argument (previously the module-level `args.device`
    # was read here, which broke calls made without that global).
    video_tensor = video_tensor.half().to(device)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            images_or_videos=video_tensor,
            modal_list=modal_list,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return outputs
493
+
494
+
495
def run_inference(args):
    """Run batched video QA inference and write predictions to ``args.answer_file``.

    Supported ``args.dataset`` values are 'mvbench', 'nextoe', 'nextqa' and
    'egoschema'; each has its own DataLoader builder and answer writer.

    Args:
        args: Parsed command-line arguments.

    Raises:
        NotImplementedError: If ``args.dataset`` is not one of the supported names.
    """
    # One (loader builder, answer writer) pair per dataset.
    eval_suites = {
        'mvbench': (build_mvbench_eval, mvbench_dump),
        'nextoe': (build_nextoe_eval, nextoe_dump),
        'nextqa': (build_nextqa_eval, nextqa_dump),
        'egoschema': (build_egoschema_eval, egoschema_dump),
    }
    # Reject unknown datasets before the (expensive) model load.
    if args.dataset not in eval_suites:
        raise NotImplementedError(f"Dataset {args.dataset} not implemented.")
    build_loader, dump_answers = eval_suites[args.dataset]

    # Initialize the model
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    # A bare file name has an empty dirname, which os.makedirs rejects.
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    val_loader = build_loader(args, processor, num_frames)

    # The context manager closes (and flushes) the answer file even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        for line in tqdm(val_loader):
            video_tensor = line['video']
            questions = line['question']

            outputs = get_model_output(model, video_tensor, tokenizer, questions, args.conv_mode, args.device)
            dump_answers(ans_file, line, outputs)
544
+
545
+
546
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Multiple-Choice Video QA Evaluation Script.')

    parser.add_argument('--dataset', help='Dataset to evaluate on.', required=True)
    parser.add_argument('--model-path', help='', required=True)
    parser.add_argument('--model_base', help='', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    # For mvbench this path is joined with each task's JSON file name, i.e. it is a directory.
    parser.add_argument('--question-file', help='Path to the question file (directory of task JSON files for mvbench).', required=True)
    # This script writes its predictions to --answer-file; it is not an input.
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=8)
    # Keep `args` at module level (do not move this into a main() function):
    # code in this file looks `args` up from module scope.
    args = parser.parse_args()
    run_inference(args)
videollama2/eval/run_inference_video_qa_gpt.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import argparse
4
+ import json
5
+ import warnings
6
+ from tqdm import tqdm
7
+
8
+ import torch
9
+ import numpy as np
10
+ import transformers
11
+ import decord
12
+ from decord import VideoReader, cpu
13
+
14
+ import sys
15
+ sys.path.append('./')
16
+ from videollama2.conversation import conv_templates, SeparatorStyle
17
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
18
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
19
+ from videollama2.model.builder import load_pretrained_model
20
+
21
+
22
# NOTE: Silence PyTorch's "TypedStorage is deprecated" UserWarning, see
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

# Video-modality placeholder tokens and token index, looked up once at import time.
(
    default_mm_token,
    default_mm_start_token,
    default_mm_end_token,
    modal_token_index,
) = (
    table["VIDEO"]
    for table in (DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX)
)
29
+
30
+
31
def split_list(lst, n):
    """Split a list into exactly n contiguous, (roughly) equal-sized chunks.

    Chunks hold ``ceil(len(lst) / n)`` items (the last non-empty chunk may be
    shorter). When that yields fewer than ``n`` chunks, empty lists are
    appended so that every index in ``range(n)`` is valid.

    Args:
        lst: Sequence to split (must support ``len`` and slicing).
        n: Number of chunks; must be non-zero.

    Returns:
        A list of ``n`` slices of ``lst``.
    """
    # Ceiling division; clamp to 1 so an empty list does not produce a zero range step.
    chunk_size = max(1, math.ceil(len(lst) / n))
    chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    # Pad with empty chunks so chunk index k < n never raises IndexError.
    chunks.extend([] for _ in range(n - len(chunks)))
    return chunks
35
+
36
+
37
def get_chunk(lst, n, k):
    """Return chunk number ``k`` (0-based) of ``lst`` split by ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
40
+
41
+
42
def get_model_output(model, tokenizer, video_tensor, questions, conv_mode="v1", device='cuda'):
    """Generate one decoded answer per question for a batch of videos.

    Args:
        model: Model exposing ``config.mm_use_im_start_end`` and ``generate``.
        tokenizer: Tokenizer providing ``pad_token_id``, ``eos_token_id`` and
            ``batch_decode``.
        video_tensor: Batched video tensor; cast to half precision and moved
            to ``device``.
        questions: Iterable of question strings, one per video.
        conv_mode: Key into ``conv_templates`` selecting the prompt template.
        device: Device on which the inputs are placed.

    Returns:
        List of decoded strings from ``tokenizer.batch_decode``.
    """
    input_ids = []
    modal_list = []
    for qs in questions:
        # Prepend the video placeholder (optionally wrapped in start/end tokens).
        if model.config.mm_use_im_start_end:
            qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
        else:
            qs = default_mm_token + "\n" + qs

        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_id = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt')
        input_ids.append(input_id)
        modal_list.append("video")

    # Left-pad: reverse each sequence, right-pad the batch, then reverse back.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [x.flip(dims=[0]) for x in input_ids],
        batch_first=True,
        padding_value=tokenizer.pad_token_id).flip(dims=[1]).to(device)

    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(device)

    # Honour the `device` argument (previously the module-level `args.device`
    # was read here, which broke calls made without that global).
    video_tensor = video_tensor.half().to(device)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            images_or_videos=video_tensor,
            modal_list=modal_list,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return outputs
84
+
85
+
86
def run_inference(args):
    """Answer every question of this worker's chunk and write JSON lines to ``args.output_file``.

    Each output line holds the question id, the question, the ground-truth
    answer and the model prediction.

    Args:
        args: Parsed command-line arguments.

    Raises:
        FileNotFoundError: If no video file can be located for a sample.
    """
    # Initialize the model
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    # Context managers close the input files as soon as they are parsed.
    with open(args.question_file, "r") as f:
        gt_questions = json.load(f)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
    with open(args.answer_file, "r") as f:
        gt_answers = json.load(f)
    gt_answers = get_chunk(gt_answers, args.num_chunks, args.chunk_idx)

    answer_file = args.output_file
    output_dir = os.path.dirname(answer_file)
    # A bare file name has an empty dirname, which os.makedirs rejects.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    # The context manager closes (and flushes) the output even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for idx, sample in enumerate(tqdm(gt_questions)):
            video_name = sample['video_name']
            question = sample['question']
            question_id = sample['question_id']
            answer = gt_answers[idx]['answer']

            # For each extension try the "v_"-prefixed name first, then the bare
            # name (compatibility for MSVD, MSRVTT, TGIF).
            candidates = (
                os.path.join(args.video_folder, f"{prefix}{video_name}{fmt}")
                for fmt in video_formats
                for prefix in ("v_", "")
            )
            video_path = next((path for path in candidates if os.path.exists(path)), None)
            if video_path is None:
                # Previously a missing video silently reused the previous sample's
                # path (or raised NameError on the first sample).
                raise FileNotFoundError(
                    f"No video found for '{video_name}' in '{args.video_folder}' (tried {video_formats})"
                )

            video_tensor = process_video(video_path, processor, aspect_ratio=None, sample_scheme='uniform', num_frames=num_frames)
            output = get_model_output(model, tokenizer, video_tensor[None], [question], args.conv_mode, args.device)[0]

            sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
            ans_file.write(json.dumps(sample_set) + "\n")
132
+
133
+
134
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='', required=True)
    parser.add_argument('--model_base', help='', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
    # --output-file is opened directly for writing, so it is a file path, not a directory.
    parser.add_argument('--output-file', help='Path of the file the model results are written to (one JSON object per line).', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)

    # Keep `args` at module level (do not move this into a main() function)
    # so module-scope lookups of `args` keep working.
    args = parser.parse_args()
    run_inference(args)
videollama2/eval/run_inference_video_qa_gpt_consistency.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ import decord
11
+ import numpy as np
12
+ import transformers
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2.conversation import conv_templates, SeparatorStyle
19
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
20
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
21
+ from videollama2.model.builder import load_pretrained_model
22
+
23
+
24
# NOTE: Silence PyTorch's "TypedStorage is deprecated" UserWarning, see
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

# Video-modality placeholder tokens and token index, looked up once at import time.
(
    default_mm_token,
    default_mm_start_token,
    default_mm_end_token,
    modal_token_index,
) = (
    table["VIDEO"]
    for table in (DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX)
)
31
+
32
+
33
def split_list(lst, n):
    """Split a list into exactly n contiguous, (roughly) equal-sized chunks.

    Chunks hold ``ceil(len(lst) / n)`` items (the last non-empty chunk may be
    shorter). When that yields fewer than ``n`` chunks, empty lists are
    appended so that every index in ``range(n)`` is valid.

    Args:
        lst: Sequence to split (must support ``len`` and slicing).
        n: Number of chunks; must be non-zero.

    Returns:
        A list of ``n`` slices of ``lst``.
    """
    # Ceiling division; clamp to 1 so an empty list does not produce a zero range step.
    chunk_size = max(1, math.ceil(len(lst) / n))
    chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    # Pad with empty chunks so chunk index k < n never raises IndexError.
    chunks.extend([] for _ in range(n - len(chunks)))
    return chunks
37
+
38
+
39
def get_chunk(lst, n, k):
    """Return chunk number ``k`` (0-based) of ``lst`` split by ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
42
+
43
+
44
class VCGPTDataset(Dataset):
    """Consistency-benchmark samples: one video paired with two questions and one answer."""

    # Extensions tried, in order, when resolving a video name to a file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_frames, video_folder=None):
        """Store the samples and decoding settings.

        Args:
            data_list: Sequence of dicts with 'Q1', 'Q2', 'A' and 'video_name'.
            processor: Processor forwarded to ``process_video``.
            num_frames: Number of frames requested from ``process_video``.
            video_folder: Directory holding the videos. When None, the
                module-level ``args.video_folder`` is used, as before.
        """
        self.data_list = data_list
        self.processor = processor
        self.num_frames = num_frames
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no supported extension matches.
        """
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question1 = line['Q1']
        question2 = line['Q2']
        answer = line['A']
        video_name = line['video_name']

        video_path = self._find_video(video_name)

        video_tensor = process_video(video_path, self.processor, aspect_ratio=None, sample_scheme='uniform', num_frames=self.num_frames)

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question1': question1,
            'question2': question2,
            'answer': answer,
        }
78
+
79
+
80
def collate_fn(batch):
    """Collate dataset items into ``(videos, names, questions1, questions2, answers)``.

    Videos are stacked along a new leading dimension; every other field is
    returned as a plain list in batch order.
    """
    def column(key):
        return [item[key] for item in batch]

    videos = column('video')
    names = column('video_name')
    first_questions = column('question1')
    second_questions = column('question2')
    answers = column('answer')
    return torch.stack(videos, dim=0), names, first_questions, second_questions, answers
88
+
89
+
90
def get_model_output(model, tokenizer, qs, video_tensor, args):
    """Generate the model's answer to a single question about a single video.

    The question is prefixed with the video placeholder token(s), rendered
    through the conversation template named by ``args.conv_mode``, tokenized
    and passed to ``model.generate`` as a batch of one.

    Returns:
        The first decoded sequence, stripped of surrounding whitespace.
    """
    if model.config.mm_use_im_start_end:
        visual_prefix = default_mm_start_token + default_mm_token + default_mm_end_token
    else:
        visual_prefix = default_mm_token
    qs = visual_prefix + "\n" + qs

    conversation = conv_templates[args.conv_mode].copy()
    conversation.append_message(conversation.roles[0], qs)
    conversation.append_message(conversation.roles[1], None)
    prompt = conversation.get_prompt()

    token_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)
    pad_mask = token_ids.ne(tokenizer.pad_token_id).to(args.device)

    video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)

    with torch.inference_mode():
        # unsqueeze(0) adds the batch dimension expected by generate().
        output_ids = model.generate(
            token_ids.unsqueeze(0),
            attention_mask=pad_mask.unsqueeze(0),
            images_or_videos=[video_tensor],
            modal_list=["video"],
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()
122
+
123
+
124
def run_inference(args):
    """Answer both questions of every sample and write JSON lines to ``args.answer_file``.

    Each output line holds the video name, the two questions, the reference
    answer and the model's two predictions.

    Args:
        args: Parsed command-line arguments; ``args.batch_size`` must be 1.
    """
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    # Only element 0 of every batch is consumed below.
    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = VCGPTDataset(questions, processor, num_frames)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    # A bare file name has an empty dirname, which os.makedirs rejects.
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes (and flushes) the output even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        for video_tensors, video_names, questions1, questions2, answers in tqdm(dataloader):

            # reduce batch dimension
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question1 = questions1[0]
            question2 = questions2[0]
            answer = answers[0]

            output1 = get_model_output(model, tokenizer, question1, video_tensor, args)
            output2 = get_model_output(model, tokenizer, question2, video_tensor, args)

            qa = {'video_name': video_name, 'Q1': question1, 'Q2': question2, 'A': answer, 'P1': output1, 'P2': output2}

            ans_file.write(json.dumps(qa) + "\n")
161
+
162
+
163
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='', required=True)
    parser.add_argument('--model_base', help='', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    # This script writes its predictions to --answer-file; it is not an input.
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)

    # Keep `args` at module level (do not move this into a main() function)
    # so module-scope lookups of `args` keep working.
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/run_inference_video_qa_gpt_general.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ import decord
11
+ import numpy as np
12
+ import transformers
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2.conversation import conv_templates, SeparatorStyle
19
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
20
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
21
+ from videollama2.model.builder import load_pretrained_model
22
+
23
+
24
# NOTE: Silence PyTorch's "TypedStorage is deprecated" UserWarning, see
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

# Video-modality placeholder tokens and token index, looked up once at import time.
(
    default_mm_token,
    default_mm_start_token,
    default_mm_end_token,
    modal_token_index,
) = (
    table["VIDEO"]
    for table in (DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX)
)
31
+
32
+
33
def split_list(lst, n):
    """Split a list into exactly n contiguous, (roughly) equal-sized chunks.

    Chunks hold ``ceil(len(lst) / n)`` items (the last non-empty chunk may be
    shorter). When that yields fewer than ``n`` chunks, empty lists are
    appended so that every index in ``range(n)`` is valid.

    Args:
        lst: Sequence to split (must support ``len`` and slicing).
        n: Number of chunks; must be non-zero.

    Returns:
        A list of ``n`` slices of ``lst``.
    """
    # Ceiling division; clamp to 1 so an empty list does not produce a zero range step.
    chunk_size = max(1, math.ceil(len(lst) / n))
    chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    # Pad with empty chunks so chunk index k < n never raises IndexError.
    chunks.extend([] for _ in range(n - len(chunks)))
    return chunks
37
+
38
+
39
def get_chunk(lst, n, k):
    """Return chunk number ``k`` (0-based) of ``lst`` split by ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
42
+
43
+
44
class VCGPTDataset(Dataset):
    """Generative video QA samples: one video paired with one question and one answer."""

    # Extensions tried, in order, when resolving a video name to a file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_frames, video_folder=None):
        """Store the samples and decoding settings.

        Args:
            data_list: Sequence of dicts with 'Q', 'A' and 'video_name'.
            processor: Processor forwarded to ``process_video``.
            num_frames: Number of frames requested from ``process_video``.
            video_folder: Directory holding the videos. When None, the
                module-level ``args.video_folder`` is used, as before.
        """
        self.data_list = data_list
        self.processor = processor
        self.num_frames = num_frames
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no supported extension matches.
        """
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question = line['Q']
        answer = line['A']
        video_name = line['video_name']

        video_path = self._find_video(video_name)

        video_tensor = process_video(video_path, self.processor, aspect_ratio=None, sample_scheme='uniform', num_frames=self.num_frames)

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question': question,
            'answer': answer,
        }
76
+
77
+
78
def collate_fn(batch):
    """Collate dataset items into ``(videos, names, questions, answers)``.

    Videos are stacked along a new leading dimension; every other field is
    returned as a plain list in batch order.
    """
    def column(key):
        return [item[key] for item in batch]

    videos = column('video')
    names = column('video_name')
    questions = column('question')
    answers = column('answer')
    return torch.stack(videos, dim=0), names, questions, answers
85
+
86
+
87
def get_model_output(model, tokenizer, qs, video_tensor, args):
    """Generate the model's answer to a single question about a single video.

    The question is prefixed with the video placeholder token(s), rendered
    through the conversation template named by ``args.conv_mode``, tokenized
    and passed to ``model.generate`` as a batch of one.

    Returns:
        The first decoded sequence, stripped of surrounding whitespace.
    """
    if model.config.mm_use_im_start_end:
        visual_prefix = default_mm_start_token + default_mm_token + default_mm_end_token
    else:
        visual_prefix = default_mm_token
    qs = visual_prefix + "\n" + qs

    conversation = conv_templates[args.conv_mode].copy()
    conversation.append_message(conversation.roles[0], qs)
    conversation.append_message(conversation.roles[1], None)
    prompt = conversation.get_prompt()

    token_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)
    pad_mask = token_ids.ne(tokenizer.pad_token_id).to(args.device)

    video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)

    with torch.inference_mode():
        # unsqueeze(0) adds the batch dimension expected by generate().
        output_ids = model.generate(
            token_ids.unsqueeze(0),
            attention_mask=pad_mask.unsqueeze(0),
            images_or_videos=[video_tensor],
            modal_list=["video"],
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()
119
+
120
+
121
def run_inference(args):
    """Answer the question of every sample and write JSON lines to ``args.answer_file``.

    Each output line holds the video name, the question, the reference answer
    and the model prediction.

    Args:
        args: Parsed command-line arguments; ``args.batch_size`` must be 1.
    """
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    # Context manager so the question file is closed right after parsing.
    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    # Only element 0 of every batch is consumed below.
    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = VCGPTDataset(questions, processor, num_frames)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    # A bare file name has an empty dirname, which os.makedirs rejects.
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes (and flushes) the output even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        # `batch_questions` avoids rebinding the `questions` list loaded above.
        for video_tensors, video_names, batch_questions, answers in tqdm(dataloader):

            # reduce batch dimension
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question = batch_questions[0]
            answer = answers[0]

            output = get_model_output(model, tokenizer, question, video_tensor, args)

            qa = {'video_name': video_name, 'Q': question, 'A': answer, 'P': output}

            ans_file.write(json.dumps(qa) + "\n")
156
+
157
+
158
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='', required=True)
    parser.add_argument('--model_base', help='', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    # This script writes its predictions to --answer-file; it is not an input.
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)

    # Keep `args` at module level (do not move this into a main() function)
    # so module-scope lookups of `args` keep working.
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/run_inference_video_qa_perception_test_mcqa.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ import decord
11
+ import numpy as np
12
+ import transformers
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2.conversation import conv_templates, SeparatorStyle
19
+ from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
20
+ from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_videos
21
+ from videollama2.model.builder import load_pretrained_model
22
+
23
+
24
# NOTE: Silence the "TypedStorage is deprecated" UserWarning; background in
# https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
warnings.filterwarnings(
    'ignore',
    message='TypedStorage is deprecated',
    category=UserWarning,
)

# Video-modality entries of the multimodal token tables; used below when the
# prompt is assembled and tokenized.
default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
31
+
32
+
33
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks may be
    produced (e.g. 10 items with ``n=8`` yields 5 chunks of 2).

    Args:
        lst: Sequence to split.
        n: Requested number of chunks.

    Returns:
        A list of slices of ``lst``; an empty list when ``lst`` is empty.
    """
    if not lst:
        # A zero chunk size would make range() raise "arg 3 must not be zero".
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the ``k``-th of the chunks produced by ``split_list(lst, n)``.

    Because ceiling division can yield fewer than ``n`` chunks, a chunk index
    past the last available chunk returns an empty list instead of raising
    ``IndexError``, so surplus workers simply have nothing to process.

    Args:
        lst: Sequence to split.
        n: Requested number of chunks.
        k: Index of the chunk to return.

    Returns:
        The selected chunk, or ``[]`` when ``k`` is beyond the last chunk.
    """
    chunks = split_list(lst, n)
    if k >= len(chunks):
        return []
    return chunks[k]
42
+
43
+
44
class PerceptionTestMCQADataset(Dataset):
    """Dataset yielding one video plus all of its multiple-choice questions.

    Each entry of ``data_list`` is a dict read as ``entry['metadata']['video_id']``
    and ``entry['mc_question']`` (a list of dicts with ``'question'``, ``'id'``
    and ``'options'`` keys; the first three options are rendered as A/B/C).
    """

    # File extensions tried, in order, when locating a video on disk.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, num_segments=8, video_folder=None):
        """Store the samples and preprocessing configuration.

        Args:
            data_list: List of per-video annotation dicts.
            processor: Object whose ``preprocess(frames, return_tensors='pt')``
                returns a mapping with a ``'pixel_values'`` entry.
            num_segments: Number of frames sampled uniformly from each video.
            video_folder: Directory holding the video files. When ``None`` the
                module-level ``args.video_folder`` is used, which keeps the
                original three-argument call working.
        """
        self.data_list = data_list
        self.processor = processor
        self.num_segments = num_segments
        self.video_folder = video_folder

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the path of the first existing ``<video_name><ext>`` file.

        Raises:
            FileNotFoundError: If no file with a supported extension exists.
        """
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        # Previously this case surfaced later as an UnboundLocalError.
        raise FileNotFoundError(
            f"No video found for '{video_name}' in '{folder}' "
            f"(tried extensions: {', '.join(self.video_formats)})"
        )

    def __getitem__(self, idx):
        """Load the video at ``idx`` and format its questions as prompts.

        Returns:
            A dict with keys ``'video'`` (processor pixel values),
            ``'video_id'``, ``'questions'`` (prompt strings),
            ``'question_ids'`` and ``'options'`` (one option list per question).
        """
        line = self.data_list[idx]
        video_name = line['metadata']['video_id']
        mc_questions = line['mc_question']

        video_path = self._find_video(video_name)

        # Uniformly sample ``num_segments`` frame indices across the whole video.
        decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
        frames = decord_vr.get_batch(np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)).asnumpy()
        video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values']  # do not pad for video frames

        qs = []
        qids = []
        ops = []
        for q in mc_questions:
            question = q['question']
            qid = q['id']
            options = q['options']
            option_question = f'Question: {question}\nOptions:\n(A) {options[0]}\n(B) {options[1]}\n(C) {options[2]}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

            qs.append(option_question)
            qids.append(qid)
            ops.append(options)

        return {
            'video': video_tensor,
            'video_id': video_name,
            'questions': qs,
            'question_ids': qids,
            'options': ops,
        }
91
+
92
+
93
def collate_fn(batch):
    """Collate dataset samples into a batch.

    Args:
        batch: List of sample dicts with the keys ``'video'``, ``'video_id'``,
            ``'questions'``, ``'question_ids'`` and ``'options'``.

    Returns:
        A 5-tuple: the ``'video'`` tensors stacked along a new leading
        dimension, followed by plain per-sample lists of video ids, questions,
        question ids and options.
    """
    videos = [sample['video'] for sample in batch]
    video_ids, questions, question_ids, options = (
        [sample[key] for sample in batch]
        for key in ('video_id', 'questions', 'question_ids', 'options')
    )
    # Only the video tensors are stacked; everything else stays a Python list.
    stacked = torch.stack(videos, dim=0)
    return stacked, video_ids, questions, question_ids, options
101
+
102
+
103
def get_model_output(model, tokenizer, qs, video_tensor, args):
    """Generate the model's text answer to one question about one video.

    Args:
        model: Model exposing ``config.mm_use_im_start_end`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        qs: Question text; the video placeholder token is prepended here.
        video_tensor: Preprocessed video frames for the question.
        args: Namespace providing ``conv_mode`` and ``device``.

    Returns:
        The first decoded sequence, stripped of surrounding whitespace.
    """
    # Prefix the question with the video placeholder, optionally wrapped in
    # the start/end marker tokens.
    if model.config.mm_use_im_start_end:
        placeholder = default_mm_start_token + default_mm_token + default_mm_end_token
    else:
        placeholder = default_mm_token
    qs = placeholder + "\n" + qs

    # One user turn followed by an empty assistant turn for the model to fill.
    conversation = conv_templates[args.conv_mode].copy()
    user_role, assistant_role = conversation.roles[0], conversation.roles[1]
    conversation.append_message(user_role, qs)
    conversation.append_message(assistant_role, None)
    prompt = conversation.get_prompt()

    input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt')
    input_ids = input_ids.to(args.device)
    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(args.device)

    modal_list = ["video"]
    video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)

    # Greedy decoding; a batch dimension is added to ids and mask.
    generation_kwargs = dict(
        attention_mask=attention_mask.unsqueeze(0),
        images_or_videos=[video_tensor],
        modal_list=modal_list,
        do_sample=False,
        max_new_tokens=1024,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.inference_mode():
        output_ids = model.generate(input_ids.unsqueeze(0), **generation_kwargs)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()
135
+
136
+
137
def _resolve_option_index(output, options, fallback=2):
    """Map a raw model answer to the index of the chosen option.

    The first capital letter A-C in ``output`` (optionally surrounded by
    parentheses) wins. If there is none, the whole answer is compared
    case-insensitively with the option texts. If that fails too, ``fallback``
    is returned.
    """
    # Capture only the letter so partially parenthesised answers such as
    # "A)" or "(A" resolve cleanly instead of failing a list lookup.
    match = re.search(r'\(*([A-C])\)*', output)
    if match is not None:
        return 'ABC'.index(match.group(1))

    lowered_options = [option.lower() for option in options]
    lowered_output = output.lower()
    if lowered_output in lowered_options:
        return lowered_options.index(lowered_output)
    return fallback


def run_inference(args):
    """Run multiple-choice video QA inference and write predictions to disk.

    Loads the model, reads the question file (a JSON object whose values are
    the per-video annotations), selects this worker's chunk, and writes one
    line per video to ``args.answer_file`` in the form
    ``"<video_id>": [<answers as JSON>],``.

    Args:
        args: Namespace providing ``model_path``, ``model_base``,
            ``question_file``, ``answer_file``, ``num_chunks``, ``chunk_idx``,
            ``batch_size`` and ``num_workers`` (plus whatever
            ``get_model_output`` reads from it).
    """
    # Initialize the model
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)

    with open(args.question_file, "r") as question_file:
        questions = json.load(question_file)
    questions = list(questions.values())
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = PerceptionTestMCQADataset(questions, processor, num_frames)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # os.makedirs('') raises, so skip it for a bare file name.
        os.makedirs(answer_dir, exist_ok=True)

    with open(answer_file, "w") as ans_file:
        # Iterate over each video in this worker's chunk
        for video_tensors, video_ids, batch_questions, batch_question_ids, batch_options in tqdm(dataloader):

            # reduce batch dimension (batch size is asserted to be 1 above)
            video_tensor = video_tensors[0]
            video_id = video_ids[0]
            video_questions = batch_questions[0]
            question_ids = batch_question_ids[0]
            options = batch_options[0]

            qas = []
            for idx, question in enumerate(video_questions):
                question_id = question_ids[idx]
                _options = options[idx]

                output = get_model_output(model, tokenizer, question, video_tensor, args)
                pred_idx = _resolve_option_index(output, _options)

                qas.append({'id': question_id, 'answer_id': pred_idx, 'answer': _options[pred_idx]})

            ans_file.write('\"{}\": {},\n'.format(video_id, json.dumps(qas)))
194
+
195
+
196
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='Path of the pretrained model to load.', required=True)
    parser.add_argument('--model_base', help='Optional base model passed to load_pretrained_model.', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the JSON file containing the questions.', required=True)
    # The answer file is written by run_inference; it is an output, not an input.
    parser.add_argument('--answer-file', help='Path of the output file the predicted answers are written to.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)

    # NOTE: `args` must stay a module-level name; PerceptionTestMCQADataset
    # reads `args.video_folder` from module scope.
    args = parser.parse_args()

    run_inference(args)
videollama2/mm_utils.py ADDED
@@ -0,0 +1,538 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import math
3
+ import base64
4
+ from io import BytesIO
5
+
6
+ import torch
7
+ import decord
8
+ import imageio
9
+ import numpy as np
10
+ from PIL import Image
11
+ from decord import VideoReader, cpu
12
+ from moviepy.editor import VideoFileClip
13
+ from transformers import StoppingCriteria
14
+
15
+ from scenedetect import open_video, SceneManager
16
+ from scenedetect.detectors import ContentDetector
17
+ from scenedetect.stats_manager import StatsManager
18
+
19
+ from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MMODAL_INDEX_TOKEN, IMAGE_TOKEN_INDEX
20
+
21
+
22
def merge_scenes(cut_list, cut_scores, scene_list,num_frames,max_scene_num=4, num_frame_per_scene=8, min_frames_per_scene=30):
    """Cap the number of scenes and uniformly sample frame ids from each one.

    Args:
        cut_list: Detected cut points; each element exposes ``get_frames()``.
        cut_scores: One score per cut; the lowest-scoring cuts are dropped first.
        scene_list: Detected scenes as ``(start, end)`` pairs.
        num_frames: Total number of frames in the video.
        max_scene_num: Maximum number of scenes kept after merging.
        num_frame_per_scene: Number of frame ids sampled per scene.
        min_frames_per_scene: Unused; kept for interface compatibility.

    Returns:
        A list with one sequence of frame indices per scene.
    """
    if len(scene_list) == len(cut_list) and len(scene_list) == 0:
        frame_ids = np.linspace(0, num_frames-1, num_frame_per_scene, dtype=int)  # only one scene for current video
        return [frame_ids]

    scene_list, cut_results = merge_scenes_not_exeed_max_scene_num(cut_list, cut_scores, scene_list, max_scene_num)

    prev_cut_point = 0
    list_of_scene_frames = []
    for (cur_cut_point, _) in cut_results:
        frame_ids = list(np.linspace(prev_cut_point, cur_cut_point-1, num_frame_per_scene, dtype=int))
        list_of_scene_frames.append(frame_ids)
        prev_cut_point = cur_cut_point
    # `prev_cut_point` equals the last cut after the loop and is 0 when there
    # were no cuts at all, in which case the loop variable would be unbound.
    if prev_cut_point < num_frames:
        frame_ids = np.linspace(prev_cut_point, num_frames-1, num_frame_per_scene, dtype=int)
        list_of_scene_frames.append(frame_ids)

    return list_of_scene_frames
40
+
41
+
42
def merge_scenes_not_exeed_max_scene_num(cut_list,cut_scores, scene_list, max_scene_num):
    """Merge adjacent scenes until at most ``max_scene_num`` remain.

    On every iteration the cut with the lowest score is removed and the two
    scenes on either side of it are fused into one ``(start, end)`` pair.

    Args:
        cut_list: Cut points; each element exposes ``get_frames()``.
        cut_scores: One score per cut (cut ``i`` separates scenes ``i`` and ``i+1``).
        scene_list: Scenes as ``(start, end)`` pairs.
        max_scene_num: Maximum number of scenes to keep.

    Returns:
        ``(scene_list, cut_results)`` where ``cut_results`` is a list of
        ``(cut_frame, cut_score)`` pairs for the surviving cuts. The inputs
        are not modified.
    """
    cut_frames = [ele.get_frames() for ele in cut_list]
    cut_scores = list(cut_scores)
    scene_list = list(scene_list)
    while len(scene_list) > max_scene_num:
        min_idx = int(np.argmin(cut_scores))
        del cut_frames[min_idx]
        del cut_scores[min_idx]

        # A cut index is always a valid left-scene index, so a single slice
        # replacement covers the first, middle and last positions alike.
        merged_scene = (scene_list[min_idx][0], scene_list[min_idx+1][1])
        scene_list[min_idx:min_idx+2] = [merged_scene]
    cut_results = list(zip(cut_frames, cut_scores))
    return scene_list, cut_results
67
+
68
+
69
def split_video_into_scenes(video_path, threshold=27.0, max_scene_num=10, num_frame_per_scene=8):
    """Detect scenes in a video and uniformly sample frame ids from each one.

    Scene cuts are found with a ``ContentDetector``. When more than
    ``max_scene_num`` scenes are detected, the cuts with the lowest
    ``delta_lum`` metric are removed one at a time (fusing the neighbouring
    scenes) until the limit is met.

    Args:
        video_path: Path of the video to analyse.
        threshold: Threshold passed to ``ContentDetector``.
        max_scene_num: Maximum number of scenes kept after merging.
        num_frame_per_scene: Number of frame ids sampled per scene.

    Returns:
        A list with one sequence of frame indices per scene.
    """
    # Open video, create a scene manager, and add a detector.
    video = open_video(video_path)
    stats_manager = StatsManager()
    scene_manager = SceneManager(stats_manager)
    detector = ContentDetector(threshold=threshold)
    scene_manager.add_detector(detector)
    scene_manager.detect_scenes(video)
    scene_list = scene_manager.get_scene_list()
    cut_list = scene_manager.get_cut_list()
    num_frames = video.duration.get_frames()
    if len(scene_list) == len(cut_list) and len(scene_list) == 0:
        frame_ids = np.linspace(0, num_frames-1, num_frame_per_scene, dtype=int)  # only one scene for current video
        return [frame_ids]
    assert len(scene_list) == len(cut_list) + 1, f"inconsistent lengths for scene list ({len(scene_list)}) vs. cut list ({len(cut_list)})"
    cut_frames = [ele.get_frames() for ele in cut_list]
    cut_scores = [stats_manager.get_metrics(f, ["delta_lum"])[0] for f in cut_frames]

    while len(scene_list) > max_scene_num:
        min_idx = int(np.argmin(cut_scores))
        del cut_frames[min_idx]
        del cut_scores[min_idx]

        # Cut `min_idx` separates scenes `min_idx` and `min_idx + 1`; fuse them.
        # The same slice expression is valid for first, middle and last cuts.
        merged_scene = (scene_list[min_idx][0], scene_list[min_idx+1][1])
        scene_list = scene_list[:min_idx] + [merged_scene] + scene_list[min_idx+2:]

    prev_cut_point = 0
    list_of_scene_frames = []
    for cur_cut_point in cut_frames:
        frame_ids = list(np.linspace(prev_cut_point, cur_cut_point-1, num_frame_per_scene, dtype=int))
        list_of_scene_frames.append(frame_ids)
        prev_cut_point = cur_cut_point
    # `prev_cut_point` is the last cut (or 0 when a single scene has no cuts,
    # where the loop variable would otherwise be unbound).
    if prev_cut_point < num_frames:
        frame_ids = np.linspace(prev_cut_point, num_frames-1, num_frame_per_scene, dtype=int)
        list_of_scene_frames.append(frame_ids)
    return list_of_scene_frames
121
+
122
+
123
def select_best_resolution(original_size, possible_resolutions):
    """Pick the candidate resolution that best fits an image of a given size.

    A candidate is preferred when it preserves more of the image's pixels
    after an aspect-preserving fit (capped at the original pixel count); ties
    are broken by the smaller amount of unused candidate area. The earliest
    candidate wins when both criteria are equal.

    Args:
        original_size (tuple): Image size as (width, height).
        possible_resolutions (list): Candidate sizes as (width, height) pairs.

    Returns:
        tuple: The chosen (width, height), or None when there are no candidates.
    """
    src_w, src_h = original_size
    src_area = src_w * src_h

    chosen = None
    # Score is (effective pixels, -wasted pixels); larger tuples are better.
    best_score = (0, float('-inf'))
    for cand_w, cand_h in possible_resolutions:
        fit = min(cand_w / src_w, cand_h / src_h)
        fitted_area = int(src_w * fit) * int(src_h * fit)
        effective = min(fitted_area, src_area)
        wasted = cand_w * cand_h - effective
        score = (effective, -wasted)
        if score > best_score:
            best_score = score
            chosen = (cand_w, cand_h)
    return chosen
146
+
147
+
148
def resize_and_pad_image(image, target_resolution):
    """Fit an image inside a target canvas, keeping aspect ratio, and centre it.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): Canvas size as (width, height).

    Returns:
        PIL.Image.Image: A black RGB canvas of the target size with the
        resized image pasted in the centre.
    """
    src_w, src_h = image.size
    dst_w, dst_h = target_resolution
    ratio_w = dst_w / src_w
    ratio_h = dst_h / src_h

    # Scale by the tighter axis; the other axis is rounded up but clamped so
    # it never exceeds the canvas.
    if ratio_w < ratio_h:
        fitted_w = dst_w
        fitted_h = min(math.ceil(src_h * ratio_w), dst_h)
    else:
        fitted_h = dst_h
        fitted_w = min(math.ceil(src_w * ratio_h), dst_w)

    scaled = image.resize((fitted_w, fitted_h))
    canvas = Image.new('RGB', (dst_w, dst_h), (0, 0, 0))
    offset = ((dst_w - fitted_w) // 2, (dst_h - fitted_h) // 2)
    canvas.paste(scaled, offset)
    return canvas
174
+
175
+
176
def divide_to_patches(image, patch_size):
    """Cut an image into square patches in row-major order.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): Side length of each patch.

    Returns:
        list: The cropped patches, top row first, left to right within a row.
    """
    width, height = image.size
    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
193
+
194
+
195
def get_anyres_image_grid_shape(image_size, grids, patch_size):
    """Compute the patch-grid shape chosen for an image of arbitrary resolution.

    Args:
        image_size (tuple): The size of the input image as (width, height).
        grids (str | Iterable[tuple[int, int]]): Candidate grids as (x, y)
            patch counts, either as a sequence or as its string literal
            (e.g. ``"[(1, 2), (2, 2)]"``).
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The selected grid as (patches across, patches down).
    """
    # Only strings need parsing; any other iterable (list, tuple, ...) is
    # used as-is instead of being passed to literal_eval.
    if isinstance(grids, str):
        grids = ast.literal_eval(grids)
    possible_resolutions = [(x * patch_size, y * patch_size) for x, y in grids]
    width, height = select_best_resolution(image_size, possible_resolutions)
    return width // patch_size, height // patch_size
211
+
212
+
213
def process_anyres_image(image, grids, patch_size):
    """Split an image of arbitrary resolution into a global view plus patches.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        grids (str | Iterable[tuple[int, int]]): Candidate grids as (x, y)
            patch counts, either as a sequence or as its string literal.
        patch_size (int): The size of the patches to be extracted.

    Returns:
        list: PIL images. The first is the whole image resized and padded to
        ``(patch_size, patch_size)``; the rest are the patches of the image
        after it was resized and padded to the best-fitting grid resolution.
    """
    # Only strings need parsing; any other iterable (list, tuple, ...) is
    # used as-is instead of being passed to literal_eval.
    if isinstance(grids, str):
        grids = ast.literal_eval(grids)
    possible_resolutions = [(x * patch_size, y * patch_size) for x, y in grids]
    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)
    patches = divide_to_patches(image_padded, patch_size)
    image_original_resize = resize_and_pad_image(image, (patch_size, patch_size))
    image_patches = [image_original_resize] + patches
    return image_patches
233
+
234
+
235
def chunk_list(input_list, chunk_size):
    """Split ``input_list`` into consecutive slices of at most ``chunk_size`` items."""
    chunks = []
    for start in range(0, len(input_list), chunk_size):
        stop = start + chunk_size
        chunks.append(input_list[start:stop])
    return chunks
237
+
238
+
239
def frame_expansion(frame_list, n):
    """Tile ``n * n`` equally sized frames into one ``n`` x ``n`` RGB image.

    Frames are laid out row by row; the size of the first frame defines the
    cell size.
    """
    assert len(frame_list) == n * n
    cell_w, cell_h = frame_list[0].width, frame_list[0].height
    canvas = Image.new('RGB', (n * cell_w, n * cell_h))
    for position, frame in enumerate(frame_list):
        row, col = divmod(position, n)
        canvas.paste(frame, (col * cell_w, row * cell_h))
    return canvas
251
+
252
+
253
def load_image_from_base64(image):
    """Decode a base64-encoded image payload and open it with PIL."""
    raw_bytes = base64.b64decode(image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)
255
+
256
+
257
def expand2square(pil_img, background_color):
    """Pad an image to a square canvas, centring it along the shorter axis.

    A square input is returned unchanged; otherwise a new image in the same
    mode, filled with ``background_color``, is returned.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    # Exactly one of the two offsets is non-zero: the one for the shorter axis.
    canvas.paste(pil_img, ((side - width) // 2, (side - height) // 2))
    return canvas
269
+
270
+
271
def process_images(images, image_processor, model_cfg):
    """Preprocess images, padding each to a square when the config asks for it.

    When ``model_cfg.image_aspect_ratio`` is ``'pad'``, every image is padded
    to a square with the processor's mean colour and preprocessed on its own;
    the results are stacked into one tensor if they all share a shape,
    otherwise a list is returned. For any other setting the processor is
    called once on the whole batch.
    """
    if getattr(model_cfg, "image_aspect_ratio", None) != 'pad':
        return image_processor(images, return_tensors='pt')['pixel_values']

    padded_tensors = []
    for image in images:
        fill_color = tuple(int(channel * 255) for channel in image_processor.image_mean)
        squared = expand2square(image, fill_color)
        pixel_values = image_processor.preprocess(squared, return_tensors='pt')['pixel_values']
        padded_tensors.append(pixel_values[0])

    reference_shape_matches = all(t.shape == padded_tensors[0].shape for t in padded_tensors)
    if reference_shape_matches:
        padded_tensors = torch.stack(padded_tensors, dim=0)
    return padded_tensors
285
+
286
+
287
def process_videos(frames, image_processor, model_cfg):
    """Preprocess video frames for inference.

    Frames go straight through ``image_processor.preprocess`` without being
    padded to a square. ``model_cfg`` is accepted but not used.

    Returns:
        The ``'pixel_values'`` entry of the processor output.
    """
    # do not pad for video frames
    processed = image_processor.preprocess(frames, return_tensors='pt')
    return processed['pixel_values']
304
+
305
+
306
def create_photo_grid(arr, rows=None, cols=None):
    """Tile a stack of equally sized images into a single grid image.

    Parameters:
        arr (numpy.ndarray | list): Array shaped [t, h, w, c], or a list of PIL
            images / numpy arrays that is stacked into such an array.
        rows (int): Optional number of grid rows. Derived from ``cols`` when
            only that is given, or from the square root of ``t`` when neither is.
        cols (int): Optional number of grid columns. Derived from ``rows``
            when omitted.

    Returns:
        numpy.ndarray: Array shaped [rows * h, cols * w, c]. Images are placed
        row by row; unused cells stay zero.

    Raises:
        ValueError: If a list holds unsupported element types, or the grid has
            fewer cells than there are images.
    """
    if isinstance(arr, list):
        first = arr[0]
        if isinstance(first, Image.Image):
            arr = np.stack([np.array(img) for img in arr])
        elif isinstance(first, np.ndarray):
            arr = np.stack(arr)
        else:
            raise ValueError("Invalid input type. Expected list of Images or numpy arrays.")

    count, cell_h, cell_w, channels = arr.shape

    # Fill in whichever grid dimension(s) the caller left out.
    if rows is None:
        if cols is None:
            rows = math.ceil(math.sqrt(count))
            cols = math.ceil(count / rows)
        else:
            rows = math.ceil(count / cols)
    elif cols is None:
        cols = math.ceil(count / rows)

    if rows * cols < count:
        raise ValueError(f"Not enough grid cells ({rows}x{cols}) to hold all images ({count}).")

    grid = np.zeros((cell_h * rows, cell_w * cols, channels), dtype=arr.dtype)

    for position, tile in enumerate(arr):
        row, col = divmod(position, cols)
        top, left = row * cell_h, col * cell_w
        grid[top:top + cell_h, left:left + cell_w, :] = tile

    return grid
354
+
355
+
356
def process_image(image_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False):
    """Load one image from disk and run it through the processor.

    Args:
        image_path: Path of the image file; it is converted to RGB.
        processor: Object providing ``image_mean`` and
            ``preprocess(images, return_tensors='pt')``.
        aspect_ratio: ``'pad'`` pads each image to a square with the
            processor's mean colour; any other value skips padding.
        num_frames: Number of copies of the image tiled into the grid when
            ``image_grid`` is set.
        image_grid: When true, a grid of repeated copies is placed before the
            image itself.

    Returns:
        The ``'pixel_values'`` entry of the processor output.
    """
    pixels = np.array(Image.open(image_path).convert('RGB'))

    frames = [pixels]
    if image_grid:
        side = math.ceil(math.sqrt(num_frames))
        repeated = np.stack([pixels] * num_frames)
        frames = [create_photo_grid(repeated, side, side), pixels]

    pil_images = [Image.fromarray(frame) for frame in frames]
    if aspect_ratio == 'pad':
        pil_images = [
            expand2square(img, tuple(int(x*255) for x in processor.image_mean))
            for img in pil_images
        ]

    return processor.preprocess(pil_images, return_tensors='pt')['pixel_values']
375
+
376
+
377
def process_video(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False, sample_scheme='uniform'):
    """Sample frames from a video and run them through the processor.

    Args:
        video_path: Path to a video file (``.gif`` is read with imageio,
            ``.webm`` with moviepy, anything else with decord), or an already
            opened reader object exposing ``get_data(frame_id)``.
        processor: Object providing ``image_mean`` and
            ``preprocess(images, return_tensors='pt')``.
        aspect_ratio: ``'pad'`` pads each frame to a square with the
            processor's mean colour; any other value skips padding.
        num_frames: Number of frames for uniform sampling (and the minimum
            for ``'fps'`` sampling).
        image_grid: When true, a photo grid of the sampled frames is placed
            before the frames.
        sample_scheme: ``'uniform'`` or ``'fps'``; only applied to path inputs.

    Returns:
        The ``'pixel_values'`` entry of the processor output.
    """
    def frame_sample(duration, mode='uniform', local_fps=None):
        """Return the frame indices to read from a video of ``duration`` frames."""
        if mode == 'uniform':
            return np.linspace(0, duration-1, num_frames, dtype=int)
        elif mode == 'fps':
            assert local_fps is not None
            # Keep the stride a positive integer; a video slower than
            # NUM_FRAMES_PER_SECOND would otherwise give a zero step.
            segment_len = max(1, int(min(local_fps // NUM_FRAMES_PER_SECOND, duration)))
            frame_id_list = np.arange(segment_len // 2, duration, segment_len, dtype=int)
            if len(frame_id_list) < num_frames:
                frame_id_list = np.linspace(0, duration-1, num_frames, dtype=int)
            return frame_id_list
        else:
            # NOTE(review): ImportError is an odd type for a bad argument; it
            # is kept so existing callers that catch it keep working.
            raise ImportError(f'Unsupported frame sampling mode: {mode}')

    def sample_ids(duration, local_fps):
        """Sample with the requested scheme, capped at MAX_FRAMES indices."""
        frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
        # limit the max input frames
        if len(frame_id_list) > MAX_FRAMES:
            frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
        return frame_id_list

    if isinstance(video_path, str):
        if video_path.endswith('.gif'):
            video_gif = imageio.get_reader(video_path)
            try:
                # GIFs are sampled with a fixed nominal rate of 10 fps.
                frame_id_list = sample_ids(len(video_gif), 10)
                wanted = set(frame_id_list.tolist())
                video_data = [frame for index, frame in enumerate(video_gif) if index in wanted]
            finally:
                video_gif.close()
        # added by lixin4ever, include the support of .webm files from sthsthv2
        elif video_path.endswith('.webm'):
            video_webm = VideoFileClip(video_path)
            try:
                video_frames = np.array(list(video_webm.iter_frames()))
                local_fps = video_webm.fps
            finally:
                video_webm.close()

            frame_id_list = sample_ids(len(video_frames), local_fps)
            video_data = video_frames[frame_id_list]
        else:
            if "Valley/finetune/source_videos" in video_path:
                # add num_threads=1 for Valley videos
                decord_vr = VideoReader(uri=video_path, ctx=cpu(0), num_threads=1)
            else:
                decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
            duration, local_fps = len(decord_vr), float(decord_vr.get_avg_fps())

            frame_id_list = sample_ids(duration, local_fps)
            batch = decord_vr.get_batch(frame_id_list)
            # The batch exposes either `.numpy()` or `.asnumpy()`; pick the
            # one that exists instead of retrying inside a bare except.
            video_data = batch.numpy() if hasattr(batch, 'numpy') else batch.asnumpy()
    else:
        video = video_path
        # NOTE(review): `duration` was never defined on this path. This
        # assumes the reader object supports len() - confirm against callers.
        duration = len(video)
        frame_id_list = frame_sample(duration, mode='uniform')
        video_data = [video.get_data(frame_id) for frame_id in frame_id_list]

    if image_grid:
        grid_h = grid_w = math.ceil(math.sqrt(num_frames))
        pg = create_photo_grid(video_data, grid_h, grid_w)
        video_data = [pg, *video_data]

    images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
    if aspect_ratio == 'pad':
        images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images]
    video = processor.preprocess(images, return_tensors='pt')['pixel_values']

    return video
451
+
452
+
453
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """Tokenize a prompt, replacing every ``<image>`` marker with a token index.

    The prompt is split on ``<image>``, each piece is tokenized separately,
    and ``image_token_index`` is inserted between consecutive pieces. If the
    first piece starts with the tokenizer's BOS id, that id is kept once at
    the front and stripped from every piece.

    Args:
        prompt: Text containing zero or more ``<image>`` markers.
        tokenizer: Callable returning an object with ``input_ids``; also
            provides ``bos_token_id``.
        image_token_index: Id inserted in place of each marker.
        return_tensors: ``None`` for a Python list, ``'pt'`` for a long tensor.

    Raises:
        ValueError: For any other ``return_tensors`` value.
    """
    pieces = [tokenizer(text).input_ids for text in prompt.split('<image>')]

    def interleave(chunks, separator):
        # chunk, separator, chunk, separator, ..., chunk
        mixed = []
        for chunk in chunks:
            mixed.append(chunk)
            mixed.append(separator)
        return mixed[:-1]

    token_ids = []
    skip = 0
    if pieces and pieces[0] and pieces[0][0] == tokenizer.bos_token_id:
        skip = 1
        token_ids.append(pieces[0][0])

    # The separator is repeated `skip + 1` times so that slicing off `skip`
    # leading entries leaves exactly one image token.
    separator = [image_token_index] * (skip + 1)
    for segment in interleave(pieces, separator):
        token_ids.extend(segment[skip:])

    if return_tensors is None:
        return token_ids
    if return_tensors == 'pt':
        return torch.tensor(token_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
473
+
474
+
475
def tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """Tokenize a prompt, replacing each modality marker with a token index.

    The marker text is ``<name>`` where ``name`` is the lower-cased entry of
    ``MMODAL_INDEX_TOKEN`` for ``MMODAL_token_index``. The prompt is split on
    that marker, each piece is tokenized separately, and
    ``MMODAL_token_index`` is inserted between consecutive pieces. If the
    first piece starts with the tokenizer's BOS id, that id is kept once at
    the front and stripped from every piece.

    Args:
        prompt: Text containing zero or more modality markers.
        tokenizer: Callable returning an object with ``input_ids``; also
            provides ``bos_token_id``.
        MMODAL_token_index: Id inserted in place of each marker.
        return_tensors: ``None`` for a Python list, ``'pt'`` for a long tensor.

    Raises:
        ValueError: For any other ``return_tensors`` value.
    """
    # Build the marker once and split the prompt a single time.
    modal_marker = f'<{MMODAL_INDEX_TOKEN[MMODAL_token_index].lower()}>'
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(modal_marker)]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    # The separator is repeated `offset + 1` times so that dropping `offset`
    # leading entries leaves exactly one modality token.
    for x in insert_separator(prompt_chunks, [MMODAL_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids
496
+
497
+
498
def get_model_name_from_path(model_path):
    """Derive a model name from a filesystem-style model path.

    The last path component is used. When it starts with ``checkpoint-`` and
    a parent component exists, the result is ``<parent>_<checkpoint-...>`` so
    checkpoints of different runs stay distinguishable.

    Args:
        model_path: Path with ``/`` separators; leading and trailing slashes
            are ignored.

    Returns:
        The derived model name.
    """
    model_paths = model_path.strip("/").split("/")
    # A bare "checkpoint-N" has no parent component to prefix with.
    if model_paths[-1].startswith('checkpoint-') and len(model_paths) > 1:
        return model_paths[-2] + "_" + model_paths[-1]
    return model_paths[-1]
505
+
506
+
507
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation once every sequence in the batch has produced a keyword.

    A sequence counts as finished when its last tokens equal the token ids of
    a keyword, or when the decoded text of its most recent tokens contains a
    keyword string.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        """Tokenize the keywords and remember the prompt length.

        Args:
            keywords: Keyword strings that end generation.
            tokenizer: Tokenizer used to encode keywords and decode outputs.
            input_ids: Prompt ids; ``input_ids.shape[1]`` is the prompt length.
        """
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            # Drop a leading BOS id so only the keyword's own tokens are matched.
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            if len(cur_keyword_ids) > self.max_keyword_len:
                self.max_keyword_len = len(cur_keyword_ids)
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the single sequence in ``output_ids`` hit a keyword."""
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            # torch.equal is False on a shape mismatch (sequence shorter than
            # the keyword) where an element-wise `==` would raise.
            if torch.equal(output_ids[0, -keyword_id.shape[0]:], keyword_id):
                return True
        # Fall back to a text match on the most recent tokens.
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True only when every sequence in the batch hit a keyword."""
        outputs = []
        for i in range(output_ids.shape[0]):
            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
        return all(outputs)
videollama2/model/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .language_model.videollama2_llama import Videollama2LlamaForCausalLM, Videollama2Config
2
+ from .language_model.videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
3
+ from .language_model.videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
videollama2/model/builder.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import os
18
+ import warnings
19
+ import shutil
20
+
21
+ import torch
22
+ from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
23
+
24
+ from . import *
25
+ from .multimodal_projector import load_mm_projector
26
+ from ..constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
27
+
28
+
29
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """Load a tokenizer and model (optionally a VideoLLaMA 2 model) for inference.

    The loading strategy is chosen from ``model_name`` and ``model_base``:

    * ``"videollama"`` in the name:
        - ``"lora"`` in the name and ``model_base`` given: load the base
          model, apply the non-LoRA trainables, then merge the LoRA weights.
        - ``model_base`` given or ``"-base"`` in the name: load the base
          language model and add the multimodal projector weights.
        - otherwise: load the full checkpoint from ``model_path``.
    * any other name: load a plain causal LM, merging PEFT weights from
      ``model_path`` when ``model_base`` is given.

    Args:
        model_path: Local directory or hub repo id of the checkpoint.
        model_base: Base model path/repo id, or ``None``.
        model_name: Name used to select the loading branch and architecture
            (``vicuna`` -> LLaMA, ``mixtral`` -> Mixtral, else Mistral).
        load_8bit: Request 8-bit loading.
        load_4bit: Request 4-bit NF4 loading (ignored if ``load_8bit``).
        device_map: ``device_map`` forwarded to ``from_pretrained``.
        device: Target device; anything other than ``"cuda"`` overrides
            ``device_map`` with ``{"": device}``.
        use_flash_attn: Request the ``flash_attention_2`` implementation.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``;
            ``token`` is additionally used for tokenizer/config loading.

    Returns:
        Tuple ``(tokenizer, model, processor, context_len)``. ``processor``
        is the vision tower's image processor for VideoLLaMA models and
        ``None`` otherwise; ``context_len`` falls back to 2048 when the
        config has no ``max_sequence_length``.
    """
    token = kwargs.get('token')

    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        # Pass the 4-bit settings through `quantization_config` only: recent
        # transformers releases raise if `load_in_4bit` is also given as a
        # separate keyword argument.
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    name_lower = model_name.lower()

    if "videollama" in name_lower:
        # Load a VideoLLaMA model
        if 'lora' in name_lower and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
        if 'lora' in name_lower and model_base is not None:
            # NOTE(review): this branch always instantiates the LLaMA-based
            # class, regardless of the backbone implied by `model_name`.
            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            print('Loading VideoLLaMA from base model...')
            model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
            if model.lm_head.weight.shape[0] != token_num:
                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

            print('Loading additional VideoLLaMA weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download

                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder)
                    return torch.load(cache_file, map_location='cpu')

                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
            # Strip the wrapper prefixes so the keys match this model's
            # state dict.
            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            model = PeftModel.from_pretrained(model, model_path)
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None or '-base' in name_lower:
            # loading vision-language projector
            print('Loading VideoLLaMA 2 from base model...')
            # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None,
            # so PretrainedConfig is used here to keep the original `_name_or_path`.
            cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
            model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)

            if 'vicuna' in name_lower:
                model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
            elif 'mixtral' in name_lower:
                model = Videollama2MixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
            else:
                model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

            # NOTE: load_mm_projector supports loading mm_projector.bin both offline and online.
            mm_projector_weights = load_mm_projector(model_path, token=token)
            model.load_state_dict(mm_projector_weights, strict=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
            if 'vicuna' in name_lower:
                model = Videollama2LlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
            elif 'mixtral' in name_lower:
                model = Videollama2MixtralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
            else:
                # NOTE: mistral-based model is our default model.
                model = Videollama2MistralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    processor = None

    if "videollama" in name_lower:
        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        model.resize_token_embeddings(len(tokenizer))

        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        vision_tower.to(device=device, dtype=torch.float16)
        # NOTE: videollama2 adopts the same processor for processing image and video.
        processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, processor, context_len
videollama2/model/language_model/videollama2_llama.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ LlamaConfig, LlamaModel, LlamaForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2Config(LlamaConfig):
    """LLaMA configuration registered under the ``videollama2_llama`` model type."""

    model_type = "videollama2_llama"
32
+
33
+
34
class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
    """LLaMA backbone combined with the VideoLLaMA 2 meta-model mixin."""

    config_class = Videollama2Config

    def __init__(self, config: LlamaConfig):
        # Cooperative init: walks the MRO through the mixin and LlamaModel.
        super().__init__(config)
39
+
40
+
41
class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
    """Causal LM head on top of :class:`Videollama2LlamaModel`."""

    config_class = Videollama2Config

    def __init__(self, config, **kwargs):
        # Deliberately skip LlamaForCausalLM.__init__ (start the MRO lookup
        # after it) so the backbone built below is the multimodal one.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = Videollama2LlamaModel(config)
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the underlying multimodal backbone."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building multimodal inputs when needed.

        When ``inputs_embeds`` is not supplied, ``images`` and the text
        inputs are first passed through
        ``prepare_inputs_labels_for_multimodal``.
        """
        # NOTE: `position_ids` is accepted but not forwarded to the parent
        # forward below.
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images_or_videos: Optional[torch.Tensor] = None,
        modal_list: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from text ids, optionally conditioned on images/videos.

        Generation is always driven by ``inputs_embeds`` computed here, so a
        caller-supplied ``inputs_embeds`` is rejected.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images_or_videos is None:
            # Text-only prompt: embed the token ids directly.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            # Only the attention mask and embeddings are needed afterwards.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                X_modalities=[images_or_videos, modal_list]
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's step inputs with ``images`` when provided."""
        images = kwargs.pop("images", None)
        step_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            step_inputs['images'] = images
        return step_inputs


# Make the config/model pair discoverable through the Auto* factories.
AutoConfig.register("videollama2_llama", Videollama2Config)
AutoModelForCausalLM.register(Videollama2Config, Videollama2LlamaForCausalLM)
videollama2/model/language_model/videollama2_mistral.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, \
24
+ MistralConfig, MistralModel, MistralForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2MistralConfig(MistralConfig):
    """Mistral configuration registered under the ``videollama2_mistral`` model type."""

    model_type = "videollama2_mistral"
34
+
35
+
36
class Videollama2MistralModel(Videollama2MetaModel, MistralModel):
    """Mistral backbone combined with the VideoLLaMA 2 meta-model mixin."""

    config_class = Videollama2MistralConfig

    def __init__(self, config: MistralConfig):
        # Cooperative init: walks the MRO through the mixin and MistralModel.
        super().__init__(config)
41
+
42
+
43
class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausalLM):
    """Causal LM head on top of :class:`Videollama2MistralModel`."""

    config_class = Videollama2MistralConfig

    def __init__(self, config, **kwargs):
        # Deliberately skip MistralForCausalLM.__init__ (start the MRO lookup
        # after it) so the backbone built below is the multimodal one.
        super(MistralForCausalLM, self).__init__(config)
        self.model = Videollama2MistralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the underlying multimodal backbone."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building multimodal inputs when needed.

        When ``inputs_embeds`` is not supplied, ``images`` and the text
        inputs are first passed through
        ``prepare_inputs_labels_for_multimodal``.
        """
        # NOTE: `position_ids` is accepted but not forwarded to the parent
        # forward below.
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images_or_videos: Optional[torch.Tensor] = None,
        modal_list: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from text ids, optionally conditioned on images/videos.

        Generation is always driven by ``inputs_embeds`` computed here, so a
        caller-supplied ``inputs_embeds`` is rejected.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images_or_videos is None:
            # Text-only prompt: embed the token ids directly.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            # Only the attention mask and embeddings are needed afterwards.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                X_modalities=[images_or_videos, modal_list]
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's step inputs with ``images`` when provided."""
        images = kwargs.pop("images", None)
        step_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            step_inputs['images'] = images
        return step_inputs


# Make the config/model pair discoverable through the Auto* factories.
AutoConfig.register("videollama2_mistral", Videollama2MistralConfig)
AutoModelForCausalLM.register(Videollama2MistralConfig, Videollama2MistralForCausalLM)
videollama2/model/language_model/videollama2_mixtral.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ MixtralConfig, MixtralModel, MixtralForCausalLM
24
+
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+ from transformers.generation.utils import GenerateOutput
27
+
28
+ from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
29
+
30
+
31
class Videollama2MixtralConfig(MixtralConfig):
    """Mixtral configuration registered under the ``videollama2_mixtral`` model type."""

    model_type = "videollama2_mixtral"
33
+
34
+
35
class Videollama2MixtralModel(Videollama2MetaModel, MixtralModel):
    """Mixtral backbone combined with the VideoLLaMA 2 meta-model mixin."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config: MixtralConfig):
        # Cooperative init: walks the MRO through the mixin and MixtralModel.
        super().__init__(config)
40
+
41
+
42
class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausalLM):
    """Causal LM head on top of :class:`Videollama2MixtralModel`."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config, **kwargs):
        # Deliberately skip MixtralForCausalLM.__init__ (start the MRO lookup
        # after it) so the backbone built below is the multimodal one.
        super(MixtralForCausalLM, self).__init__(config)
        self.model = Videollama2MixtralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the underlying multimodal backbone."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building multimodal inputs when needed.

        When ``inputs_embeds`` is not supplied, ``images`` and the text
        inputs are first passed through
        ``prepare_inputs_labels_for_multimodal``.
        """
        # NOTE: `position_ids` is accepted but not forwarded to the parent
        # forward below.
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images_or_videos: Optional[torch.Tensor] = None,
        timestamps: Optional[torch.Tensor] = None,
        modal_list: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from text ids, optionally conditioned on images/videos.

        ``timestamps`` is appended to the modality bundle only when given.
        Generation is always driven by ``inputs_embeds`` computed here, so a
        caller-supplied ``inputs_embeds`` is rejected.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images_or_videos is None:
            # Text-only prompt: embed the token ids directly.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            X_modalities = [images_or_videos, modal_list]
            if timestamps is not None:
                X_modalities.append(timestamps)
            # Only the attention mask and embeddings are needed afterwards.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                X_modalities=X_modalities
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's step inputs with ``images`` when provided."""
        images = kwargs.pop("images", None)
        step_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            step_inputs['images'] = images
        return step_inputs


# Make the config/model pair discoverable through the Auto* factories.
AutoConfig.register("videollama2_mixtral", Videollama2MixtralConfig)
AutoModelForCausalLM.register(Videollama2MixtralConfig, Videollama2MixtralForCausalLM)
videollama2/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from .clip_encoder import CLIPVisionTower
4
+
5
+
6
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Build the vision encoder named in ``vision_tower_cfg``.

    The tower name is read from ``mm_vision_tower``, falling back to
    ``vision_tower``. Only CLIP-style towers are supported: names starting
    with ``openai`` or ``laion``, or containing ``clip``.

    Args:
        vision_tower_cfg: Config/namespace object carrying the tower name and
            the options consumed by ``CLIPVisionTower``.
        **kwargs: Forwarded to ``CLIPVisionTower``.

    Returns:
        A ``CLIPVisionTower`` instance.

    Raises:
        ValueError: If no tower is configured or the name is not recognised.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))

    # A missing tower name used to crash with a TypeError before reaching the
    # validation below; report it through the same ValueError instead.
    if vision_tower is None:
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    if vision_tower.startswith(("openai", "laion")) or 'clip' in vision_tower:
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
videollama2/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
5
+
6
+
7
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision encoder exposing intermediate hidden states.

    Args:
        vision_tower: Name or path passed to the CLIP ``from_pretrained``
            loaders.
        args: Object providing ``mm_vision_select_layer`` (index into the
            encoder's hidden states) and optionally
            ``mm_vision_select_feature`` (``'patch'`` by default, or
            ``'cls_patch'``).
        delay_load: When True, only the CLIP config is loaded here and
            ``load_model`` must be called later.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            # Keep the config available (hidden size, patch counts) without
            # loading the weights.
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        """Load the image processor and the CLIP weights, then freeze them."""
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)

        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden-state layer and token subset.

        Raises:
            ValueError: If ``select_feature`` is neither ``'patch'`` nor
                ``'cls_patch'``.
        """
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            # Drop the first token along dim 1 (the class token, given the
            # 'patch' / 'cls_patch' naming).
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode images into selected CLIP features.

        Args:
            images: Either a batched tensor, or a list/tuple of per-image
                tensors; each element of a sequence is given a leading batch
                dimension and encoded separately.

        Returns:
            A tensor for tensor input, or a list of tensors for sequence
            input, cast back to the input dtype.
        """
        if isinstance(images, (list, tuple)):
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                image_features.append(self.feature_select(image_forward_out).to(image.dtype))
            return image_features

        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        return self.feature_select(image_forward_outs).to(images.dtype)

    @property
    def dummy_feature(self):
        """All-zero feature of shape ``(1, hidden_size)`` on the tower's device/dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """Dtype of the wrapped CLIP model."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device of the wrapped CLIP model."""
        return self.vision_tower.device

    @property
    def config(self):
        """CLIP config: from the loaded model, else the config-only copy."""
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        """Hidden size reported by the CLIP config."""
        return self.config.hidden_size

    @property
    def num_patches(self):
        """Number of patches per image: ``(image_size // patch_size) ** 2``."""
        return (self.config.image_size // self.config.patch_size) ** 2

    @property
    def num_patches_per_side(self):
        """Number of patches along one image side."""
        return self.config.image_size // self.config.patch_size
videollama2/model/multimodal_projector/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .builder import load_mm_projector
videollama2/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Alibaba DAMO Academy
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+
18
+ import einops
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from timm.models.regnet import RegStage
23
+ from timm.models.layers import LayerNorm2d
24
+ from transformers import TRANSFORMERS_CACHE
25
+
26
+
27
def parse_snapshot_folder(repo_id, cache_dir=None, repo_type="model"):
    """Build the expected local snapshot path for a repo's ``main`` revision.

    Args:
        repo_id: repository identifier such as ``"org/name"``.
        cache_dir: cache root; ``TRANSFORMERS_CACHE`` is used when ``None``.
        repo_type: repository kind, used as the cache-folder prefix
            (``"model"`` -> ``models--org--name``).

    Returns:
        ``<cache_dir>/<repo_type>s--<org>--<name>/snapshots/<revision>``.
        ``<revision>`` is the content of ``refs/main`` when that file exists,
        otherwise the literal ``"main"``. The returned path is not checked
        for existence.
    """
    revision = "main"
    # 1. parse the downloaded cache folder
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    object_id = repo_id.replace("/", "--")
    repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
    # 2. resolve refs (for instance to convert main to the associated commit sha)
    revision_file = os.path.join(repo_cache, "refs", revision)
    if os.path.isfile(revision_file):
        with open(revision_file) as f:
            # Strip so a trailing newline in the ref file cannot end up in the path.
            resolved = f.read().strip()
        if resolved:
            revision = resolved
    # 3. acquire the snapshot folder
    return os.path.join(repo_cache, "snapshots", revision)
47
+
48
+
49
def load_mm_projector(model_path, cache_dir=None, token=None):
    """Load ``mm_projector.bin`` from a local folder or a Hub repo as float16 tensors.

    Args:
        model_path: local directory containing ``mm_projector.bin``, or a Hub
            repo id to look up in the cache and download if missing.
        cache_dir: cache root forwarded to the cache lookup and the download.
        token: access token forwarded to ``snapshot_download``.

    Returns:
        dict mapping each stored key to its value converted to ``torch.float16``.
    """
    weights_name = 'mm_projector.bin'
    if os.path.exists(os.path.join(model_path, weights_name)):
        folder = model_path
    else:
        folder = parse_snapshot_folder(model_path, cache_dir=cache_dir, repo_type="model")
        if not os.path.exists(os.path.join(folder, weights_name)):
            # downloading from remote repo
            from huggingface_hub import snapshot_download
            # Use the path the download reports: before the first download no
            # refs file exists, so the guessed folder ends in "snapshots/main",
            # which is not where the files are placed.
            folder = snapshot_download(repo_id=model_path, cache_dir=cache_dir, token=token)

    # NOTE(security): torch.load unpickles the file; only point this at trusted repos/paths.
    mm_projector_weights = torch.load(os.path.join(folder, weights_name), map_location='cpu')
    return {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
64
+
65
+
66
class IdentityMap(nn.Module):
    """Projector that hands its input back unchanged."""

    def forward(self, x, *args, **kwargs):
        """Return ``x`` as-is; extra positional and keyword arguments are ignored."""
        return x

    @property
    def config(self):
        """Minimal config dict naming this projector type."""
        return dict(mm_projector_type='identity')
77
+
78
+
79
class SimpleResBlock(nn.Module):
    """LayerNorm followed by a two-layer GELU MLP added back onto the normalized input."""

    def __init__(self, channels):
        """Create the norm and the MLP; input and output width are both ``channels``."""
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        layers = [
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        ]
        self.proj = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``norm(x) + proj(norm(x))`` (the skip path carries the normalized input)."""
        normed = self.pre_norm(x)
        update = self.proj(normed)
        return normed + update
93
+
94
+
95
def build_vision_projector(config, delay_load=False, **kwargs):
    """Create the vision-to-language projector named by ``config.mm_projector_type``.

    Args:
        config: object providing ``mm_hidden_size`` and ``hidden_size``;
            ``mm_projector_type`` defaults to ``'linear'`` when absent.
        delay_load: accepted but not used by this function.
        **kwargs: accepted but not used by this function.

    Returns:
        The projector module for the requested type.

    Raises:
        ValueError: if the projector type is not recognised.
    """
    projector_type = getattr(config, 'mm_projector_type', 'linear')

    # "mlp<N>x_gelu": one input Linear followed by N-1 (GELU, Linear) pairs.
    depth_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if depth_match:
        n_layers = int(depth_match.group(1))
        layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        for _ in range(n_layers - 1):
            layers += [nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
        return nn.Sequential(*layers)

    if projector_type == "linear":
        # NOTE: for both linear and mlp2x_gelu projector type, mean pooling is adopted to aggregate video features
        return nn.Linear(config.mm_hidden_size, config.hidden_size)
    if projector_type == "stc_connector":
        return STCConnector(config)
    if projector_type == "stp_connector":
        return STPConnector(config)
    if projector_type == "stc_connector_v35":
        return STCConnectorV35(config)
    if projector_type == "spatial_conv":
        return SpatialConv(config)
    if projector_type == "spatial_pool":
        return SpatialPool(config)
    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {projector_type}')
123
+
124
+
125
def build_mlp(depth, hidden_size, output_hidden_size):
    """Stack ``Linear`` layers separated by GELU activations.

    The first layer maps ``hidden_size`` to ``output_hidden_size``; each
    further level (for ``depth`` above 1) appends a GELU and an
    ``output_hidden_size`` -> ``output_hidden_size`` Linear. A ``depth`` of 0
    or 1 yields just the first Linear.
    """
    layers = [nn.Linear(hidden_size, output_hidden_size)]
    extra_levels = range(1, depth)
    for _ in extra_levels:
        layers.extend((nn.GELU(), nn.Linear(output_hidden_size, output_hidden_size)))
    return nn.Sequential(*layers)
131
+
132
+
133
class STCConnector(nn.Module):
    """Connector: per-frame 2D stage -> strided 3D conv over (t, h, w) -> per-frame 2D stage -> MLP readout."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Temporal Convolutional Vision-Language Connector.

        Args:
            config: config object providing ``mm_hidden_size`` and ``hidden_size``.
            downsample: (temporal, height, width) downsample rate, used as both
                kernel size and stride of the 3D convolution.
            depth: depth of the spatial interaction blocks; 0 replaces both
                stages with ``nn.Identity``.
            mlp_depth: depth of the vision-language projector layers.
        """
        super().__init__()
        self.encoder_hidden_size = config.mm_hidden_size
        self.hidden_size = config.hidden_size
        self.output_hidden_size = config.hidden_size
        # TODO: make these as config arguments
        self.depth = depth
        self.mlp_depth = mlp_depth
        self.downsample = downsample

        has_stages = depth != 0
        # Sub-modules are created in the order s1, sampler, s2, readout.
        self.s1 = self._make_stage(self.encoder_hidden_size) if has_stages else nn.Identity()
        self.sampler = nn.Sequential(
            nn.Conv3d(
                in_channels=self.hidden_size,
                out_channels=self.hidden_size,
                kernel_size=downsample,
                stride=downsample,
                padding=1,
                bias=True,
            ),
            nn.SiLU(),
        )
        self.s2 = self._make_stage(self.hidden_size) if has_stages else nn.Identity()
        self.readout = build_mlp(mlp_depth, self.hidden_size, self.output_hidden_size)

    def _make_stage(self, in_chs):
        """Build one ``RegStage`` of ``self.depth`` blocks mapping ``in_chs`` to ``self.hidden_size`` channels."""
        return RegStage(
            depth=self.depth,
            in_chs=in_chs,
            out_chs=self.hidden_size,
            stride=1,
            dilation=1,
            act_layer=nn.SiLU,
            norm_layer=LayerNorm2d,
        )

    def forward(self, x):
        """Aggregate tokens on the temporal and spatial dimensions.

        Args:
            x: input tokens [b, t, h, w, d] / [b, t, l, d]; for the 4-D form
                ``l`` is split into a square ``h x w`` grid.
        Returns:
            aggregated tokens [b, l, d]
        """
        num_frames = x.size(1)
        if x.ndim == 4:
            side = int(x.size(2) ** 0.5)
            x = einops.rearrange(x, "b t (h w) d -> b d t h w", h=side, w=side)
        elif x.ndim == 5:
            x = einops.rearrange(x, "b t h w d -> b d t h w")

        # 1. the first stage of the adapter (frames folded into the batch axis)
        x = self.s1(einops.rearrange(x, "b d t h w -> (b t) d h w"))
        # 2. downsampler (needs the explicit time axis back)
        x = self.sampler(einops.rearrange(x, "(b t) d h w -> b d t h w", t=num_frames))
        pooled_frames = x.size(2)
        # 3. the second stage of the adapter
        x = self.s2(einops.rearrange(x, "b d t h w -> (b t) d h w"))
        tokens = einops.rearrange(x, "(b t) d h w -> b (t h w) d", t=pooled_frames)
        return self.readout(tokens)
216
+
217
+
218
class STPConnector(STCConnector):
    """``STCConnector`` variant whose sampler is 3D average pooling followed by SiLU."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Build the parent connector, then replace its sampler with ``AvgPool3d(downsample)`` + SiLU."""
        super().__init__(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        pooling = nn.AvgPool3d(downsample)
        self.sampler = nn.Sequential(pooling, nn.SiLU())
223
+
224
+
225
class STCConnectorV35(STCConnector):
    """``STCConnector`` variant whose 3D convolution uses ``padding=0`` instead of 1."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Build the parent connector, then replace its sampler with an unpadded Conv3d + SiLU."""
        super().__init__(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        unpadded_conv = nn.Conv3d(
            in_channels=self.hidden_size,
            out_channels=self.hidden_size,
            kernel_size=downsample,
            stride=downsample,
            padding=0,
            bias=True,
        )
        self.sampler = nn.Sequential(unpadded_conv, nn.SiLU())
239
+
240
+
241
class SpatialConv(STCConnector):
    """``STCConnector`` with different defaults: ``downsample=(1, 2, 2)`` and ``depth=0``."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        """Forward all arguments unchanged to ``STCConnector.__init__``."""
        options = dict(downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        super().__init__(config=config, **options)
245
+
246
+
247
class SpatialPool(STPConnector):
    """``STPConnector`` with different defaults: ``downsample=(1, 2, 2)`` and ``depth=0``."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        """Forward all arguments unchanged to ``STPConnector.__init__``."""
        options = dict(downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        super().__init__(config=config, **options)
videollama2/model/videollama2_arch.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from abc import ABC, abstractmethod
18
+
19
+ import einops
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from .multimodal_encoder.builder import build_vision_tower
24
+ from .multimodal_projector.builder import build_vision_projector
25
+ from ..mm_utils import get_anyres_image_grid_shape
26
+ from ..constants import NUM_FRAMES, IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN,DEFAULT_MMODAL_PATCH_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
27
+
28
+
29
class Videollama2MetaModel:
    """Mixin that attaches a vision tower and a multimodal projector to a model.

    ``__init__`` forwards ``config`` to the next class in the MRO, so this
    class is meant to be combined with a base whose constructor accepts it.
    """

    def __init__(self, config):
        """Create the vision tower (delayed load) and projector when ``config`` names a tower."""
        super(Videollama2MetaModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

    def get_vision_tower(self):
        """Return the vision tower, unwrapping the single-element list used under FSDP; ``None`` if unset."""
        vision_tower = getattr(self, 'vision_tower', None)
        if type(vision_tower) is list:
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Build or load the vision tower and projector and record their settings on ``self.config``.

        Args:
            model_args: object providing ``vision_tower``, ``mm_vision_select_layer``,
                ``mm_vision_select_feature``, ``pretrain_mm_mlp_adapter`` and
                optionally ``mm_projector_type`` (default ``'linear'``).
            fsdp: when non-empty, the tower is stored wrapped in a list.
        """
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower

        use_fsdp = fsdp is not None and len(fsdp) > 0
        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)

            if use_fsdp:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            if use_fsdp:
                vision_tower = self.vision_tower[0]
            else:
                vision_tower = self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            if os.path.exists(pretrain_mm_mlp_adapter):
                # NOTE(security): torch.load unpickles the file; only use trusted checkpoints.
                mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            else:
                # Support loading projector weights from remote HuggingFace model hub.
                # load_mm_projector is not imported at module level in this file,
                # so bring it into scope here.
                from .multimodal_projector.builder import load_mm_projector

                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.replace('mm_projector.bin', '')
                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.strip('/').strip('\\').strip()
                mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)

            def get_w(weights, keyword):
                # Keep entries whose key contains `keyword`, dropping everything up to "<keyword>.".
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

            # set strict=False to avoid missing key error regarding bert.embeddings.position_ids
            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'), strict=False)
96
+
97
+
98
+ class Videollama2MetaForCausalLM(ABC):
99
+
100
@abstractmethod
def get_model(self):
    """Abstract hook: subclasses return the underlying model object.

    Other methods of this class call ``get_vision_tower()``, ``embed_tokens``
    and ``mm_projector`` on the returned object.
    """
103
+
104
def num_frames(self):
    """Return ``self.config.num_frames`` when the config defines it, else the ``NUM_FRAMES`` constant."""
    cfg = self.config
    return cfg.num_frames if hasattr(cfg, 'num_frames') else NUM_FRAMES
109
+
110
def get_vision_tower(self):
    """Delegate to the underlying model's ``get_vision_tower()``."""
    model = self.get_model()
    return model.get_vision_tower()
112
+
113
def encode_images_or_videos(self, images_or_videos, modalities):
    """Encode a mixed batch of images and videos into projected features.

    Items whose modality is ``'image'`` are expanded along a new leading frame
    axis to the configured frame count; all other items are used as given.
    The items are stacked, every frame is passed through the vision tower,
    and the per-frame features are handed to ``temporal_aggregator``.

    Args:
        images_or_videos: per-sample tensors (stacked result must be 5-D).
        modalities: per-sample modality strings, zipped with the inputs.
    """
    frame_count = self.config.num_frames if hasattr(self.config, 'num_frames') else NUM_FRAMES

    clips = []
    for item, modal in zip(images_or_videos, modalities):
        if modal == 'image':
            # Repeat the still image as a view so it has the same frame axis as a video.
            item = item.unsqueeze(0).expand(frame_count, -1, -1, -1)
        clips.append(item)
    videos = torch.stack(clips, dim=0)

    assert len(videos.size()) == 5
    batch_size = videos.size(0)

    # Fold frames into the batch axis for the tower, then unfold again.
    flat_frames = einops.rearrange(videos, 'b t c h w -> (b t) c h w')
    frame_feats = self.get_model().get_vision_tower()(flat_frames)
    frame_feats = einops.rearrange(frame_feats, '(b t) n h -> b t n h', b=batch_size)

    return self.temporal_aggregator(frame_feats)
127
+
128
def temporal_aggregator(self, frames_features):
    """Temporal aggregation of frame features.

    ``"linear"`` and ``"mlp2x_gelu"`` projectors receive the mean over
    dimension 1; ``"spatial_conv"``, ``"spatial_pool"`` and any type containing
    ``"tc_connector"`` or ``"tp_connector"`` receive the features unchanged.

    Args:
        frames_features (torch.Tensor): Frame features with shape (b, t, n, h).
    Returns:
        torch.Tensor: Video features with shape (b, n, h).
    Raises:
        Exception: for any other projector type.
    """
    # TODO: improve the merging method.
    kind = self.config.mm_projector_type
    if kind in ("mlp2x_gelu", "linear"):
        # mean pooling over the frame axis
        projector_input = frames_features.mean(1)
    elif (
        kind in ("spatial_conv", "spatial_pool")
        or "tc_connector" in kind
        or "tp_connector" in kind
    ):
        # these projectors consume the per-frame features directly
        projector_input = frames_features
    else:
        raise Exception(f"Unsupported projector type {self.config.mm_projector_type}!!!")

    return self.get_model().mm_projector(projector_input)
152
+
153
def prepare_inputs_labels_for_multimodal(
    self, input_ids, attention_mask, past_key_values, labels, X_modalities
):
    """Replace modality placeholder tokens with encoded features and rebuild labels/mask.

    Args:
        input_ids: token ids, iterated per sample and indexed as ``[batch, seq]``.
        attention_mask: mask aligned with ``input_ids``, or ``None``.
        past_key_values: returned unchanged.
        labels: labels aligned with ``input_ids``, or ``None``.
        X_modalities: ``(inputs, keys)`` pair passed to
            ``encode_images_or_videos``; ``keys`` are also looked up
            (upper-cased) in ``MMODAL_TOKEN_INDEX``. ``None`` means text-only.

    Returns:
        ``(input_ids, attention_mask, past_key_values, inputs_embeds, labels)``.
        When there is no vision tower, no modality input, or the sequence
        length is 1, the inputs are returned untouched with ``inputs_embeds``
        set to ``None``. Otherwise ``input_ids`` is ``None`` and the embeddings,
        labels (placeholder positions filled with ``IGNORE_INDEX``) and mask are
        padded on the right to the longest sample.
    """
    vision_tower = self.get_vision_tower()
    # NOTE: text-only situation
    if vision_tower is None or X_modalities is None or input_ids.shape[1] == 1:
        return input_ids, attention_mask, past_key_values, None, labels

    Xs, keys = X_modalities
    X_features = self.encode_images_or_videos(Xs, keys)

    def _modal_token_mask(ids):
        # True wherever `ids` equals the placeholder index of any modality in `keys`.
        return torch.any(torch.stack([ids == MMODAL_TOKEN_INDEX[key.upper()] for key in keys]), dim=0)

    new_input_embeds = []
    new_labels = [] if labels is not None else None
    cur_X_idx = 0
    # replace image/video/audio tokens with pre-computed embeddings
    for batch_idx, cur_input_ids in enumerate(input_ids):
        if _modal_token_mask(cur_input_ids).sum() == 0:
            # No placeholder in this sample: a zero-length slice of the next
            # feature is still concatenated and that feature is consumed.
            half_len = cur_input_ids.shape[0] // 2
            cur_X_features = X_features[cur_X_idx]
            cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
            cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
            cur_input_embeds = torch.cat([cur_input_embeds_1, cur_X_features[0:0], cur_input_embeds_2], dim=0)
            new_input_embeds.append(cur_input_embeds)
            if labels is not None:
                new_labels.append(labels[batch_idx])
            cur_X_idx += 1
            continue

        X_token_indices = torch.where(_modal_token_mask(cur_input_ids))[0]
        cur_new_input_embeds = []
        if labels is not None:
            cur_labels = labels[batch_idx]
            cur_new_labels = []
            assert cur_labels.shape == cur_input_ids.shape

        # Consume the sequence left to right: text before the placeholder,
        # then the features, then continue after the placeholder.
        while X_token_indices.numel() > 0:
            cur_X_features = X_features[cur_X_idx]
            X_token_start = X_token_indices[0]

            cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:X_token_start]))
            cur_new_input_embeds.append(cur_X_features)
            if labels is not None:
                cur_new_labels.append(cur_labels[:X_token_start])
                cur_new_labels.append(torch.full((cur_X_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                cur_labels = cur_labels[X_token_start+1:]

            cur_X_idx += 1
            cur_input_ids = cur_input_ids[X_token_start+1:]
            X_token_indices = torch.where(_modal_token_mask(cur_input_ids))[0]

        if cur_input_ids.numel() > 0:
            cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
            if labels is not None:
                cur_new_labels.append(cur_labels)
        cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
        # NOTE: one cur_new_input_embeds per each
        cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
        new_input_embeds.append(cur_new_input_embeds)
        if labels is not None:
            cur_new_labels = torch.cat(cur_new_labels, dim=0)
            new_labels.append(cur_new_labels)

    # padding
    if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
        # Unpadded per-sample lengths; the attention mask is extended from
        # these rather than from the labels so it also works when labels is None.
        seq_lens = [x.shape[0] for x in new_input_embeds]
        max_len = max(seq_lens)

        new_input_embeds_align = []
        for cur_new_embed in new_input_embeds:
            cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
            new_input_embeds_align.append(cur_new_embed)
        new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

        if labels is not None:
            new_labels_align = []
            for cur_new_label in new_labels:
                cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                new_labels_align.append(cur_new_label)
            new_labels = torch.stack(new_labels_align, dim=0)

        if attention_mask is not None:
            new_attention_mask = []
            for cur_attention_mask, cur_len in zip(attention_mask, seq_lens):
                # Left: positions gained by expanding placeholders (attended).
                new_attn_mask_pad_left = torch.full((cur_len - input_ids.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                # Right: batch padding (masked out).
                new_attn_mask_pad_right = torch.full((max_len - cur_len,), False, dtype=attention_mask.dtype, device=attention_mask.device)
                cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                new_attention_mask.append(cur_new_attention_mask)
            attention_mask = torch.stack(new_attention_mask, dim=0)
            assert attention_mask.shape == new_input_embeds.shape[:2]
    else:
        new_input_embeds = torch.stack(new_input_embeds, dim=0)
        if labels is not None:
            new_labels = torch.stack(new_labels, dim=0)

        if attention_mask is not None:
            new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
            attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
            assert attention_mask.shape == new_input_embeds.shape[:2]

    return None, attention_mask, past_key_values, new_input_embeds, new_labels
257
+
258
def initialize_vision_tokenizer(self, model_args, tokenizer):
    """Register image special tokens and initialise / freeze embeddings accordingly.

    Args:
        model_args: object providing ``mm_use_im_patch_token``,
            ``mm_use_im_start_end``, ``tune_mm_mlp_adapter`` and
            ``pretrain_mm_mlp_adapter``.
        tokenizer: tokenizer exposing ``add_tokens`` and ``__len__``.

    Raises:
        ValueError: if pretrained ``model.embed_tokens.weight`` has a shape that
            matches neither the full embedding table nor the new-token count.
    """
    if model_args.mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        self.resize_token_embeddings(len(tokenizer))

    if not model_args.mm_use_im_start_end:
        # Only the patch token (if any) was added: freeze both embedding tables when tuning the adapter.
        if model_args.mm_use_im_patch_token and model_args.tune_mm_mlp_adapter:
            for p in self.get_input_embeddings().parameters():
                p.requires_grad = False
            for p in self.get_output_embeddings().parameters():
                p.requires_grad = False
        return

    num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    self.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = self.get_input_embeddings().weight.data
        output_embeddings = self.get_output_embeddings().weight.data

        # New rows start as the mean of all pre-existing rows.
        input_embeddings[-num_new_tokens:] = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings[-num_new_tokens:] = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

    if model_args.tune_mm_mlp_adapter:
        for p in self.get_input_embeddings().parameters():
            p.requires_grad = True
        for p in self.get_output_embeddings().parameters():
            p.requires_grad = False

    if model_args.pretrain_mm_mlp_adapter:
        mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
        embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
        assert num_new_tokens == 2
        if input_embeddings.shape == embed_tokens_weight.shape:
            input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
        elif embed_tokens_weight.shape[0] == num_new_tokens:
            input_embeddings[-num_new_tokens:] = embed_tokens_weight
        else:
            raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
299
+
300
def initialize_MM_tokenizer(self, model_args, tokenizer):
    """Register image/video/audio special tokens and initialise / freeze embeddings.

    Same flow as ``initialize_vision_tokenizer`` but with one patch token and
    one start/end pair per modality (``IMAGE``, ``VIDEO``, ``AUDIO``).

    Raises:
        ValueError: if pretrained ``model.embed_tokens.weight`` has a shape that
            matches neither the full embedding table nor the new-token count.
    """
    modal_names = ['IMAGE', 'VIDEO', 'AUDIO']

    if model_args.mm_use_im_patch_token:
        for modal in modal_names:
            tokenizer.add_tokens([DEFAULT_MMODAL_PATCH_TOKEN[modal.upper()]], special_tokens=True)
        self.resize_token_embeddings(len(tokenizer))

    if not model_args.mm_use_im_start_end:
        # Only patch tokens (if any) were added: freeze both embedding tables when tuning the adapter.
        if model_args.mm_use_im_patch_token and model_args.tune_mm_mlp_adapter:
            for p in self.get_input_embeddings().parameters():
                p.requires_grad = False
            for p in self.get_output_embeddings().parameters():
                p.requires_grad = False
        return

    num_new_tokens = 0
    for modal in modal_names:
        pair = [DEFAULT_MMODAL_START_TOKEN[modal.upper()], DEFAULT_MMODAL_END_TOKEN[modal.upper()]]
        num_new_tokens += tokenizer.add_tokens(pair, special_tokens=True)
    self.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = self.get_input_embeddings().weight.data
        output_embeddings = self.get_output_embeddings().weight.data

        # New rows start as the mean of all pre-existing rows.
        input_embeddings[-num_new_tokens:] = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings[-num_new_tokens:] = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

    if model_args.tune_mm_mlp_adapter:
        for p in self.get_input_embeddings().parameters():
            p.requires_grad = True
        for p in self.get_output_embeddings().parameters():
            p.requires_grad = False

    if model_args.pretrain_mm_mlp_adapter:
        mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
        embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
        assert num_new_tokens == 6  # start/end tokens for image/video/audio
        if input_embeddings.shape == embed_tokens_weight.shape:
            input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
        elif embed_tokens_weight.shape[0] == num_new_tokens:
            input_embeddings[-num_new_tokens:] = embed_tokens_weight
        else:
            raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
videollama2/serve/cli.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+
4
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, NUM_FRAMES
5
+ from videollama2.conversation import conv_templates, SeparatorStyle
6
+ from videollama2.model.builder import load_pretrained_model
7
+ from videollama2.utils import disable_torch_init
8
+ from videollama2.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, tokenizer_MMODAL_token
9
+
10
+ from PIL import Image
11
+ from decord import VideoReader, cpu
12
+
13
+ import requests
14
+ from io import BytesIO
15
+ from transformers import TextStreamer
16
+
17
+
18
def load_image(image_file):
    """Open an image from a local path or an http(s) URL and convert it to RGB."""
    is_remote = image_file.startswith(('http://', 'https://'))
    if is_remote:
        payload = requests.get(image_file).content
        source = BytesIO(payload)
    else:
        source = image_file
    return Image.open(source).convert('RGB')
25
+
26
def load_video(video_file):
    """Read ``NUM_FRAMES`` evenly spaced frames from a video with decord.

    Frame indices run from the first to the last frame inclusive and are
    truncated to integers, so short videos may yield repeated frames.

    Args:
        video_file: path/URI handed to ``decord.VideoReader``.

    Returns:
        The batch object returned by ``VideoReader.get_batch``.
    """
    # numpy is not imported at the top of this script, so import it here.
    import numpy as np

    decord_vr = VideoReader(uri=video_file, ctx=cpu(0))
    duration = len(decord_vr)
    frame_id_list = np.linspace(0, duration - 1, NUM_FRAMES, dtype=int)
    return decord_vr.get_batch(frame_id_list)
32
+
33
def load_image_or_video(image_or_video_file):
    """Load an image or a video, chosen by file extension (case-insensitive).

    Args:
        image_or_video_file: path or URL ending in an image extension
            (.jpg/.jpeg/.png/.bmp) or a video extension (.mp4/.avi/.mov).

    Returns:
        Whatever ``load_image`` or ``load_video`` returns for the file.

    Raises:
        Exception: if the extension is not one of the supported ones.
    """
    # Match on a lower-cased copy so "CLIP.MP4" is accepted; pass the original through.
    lowered = image_or_video_file.lower()
    if lowered.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        return load_image(image_file=image_or_video_file)
    if lowered.endswith(('.mp4', '.avi', '.mov')):
        return load_video(video_file=image_or_video_file)
    raise Exception(f"File type of {image_or_video_file} not supported!!!")
40
+
41
+
42
def main(args):
    """Run an interactive terminal chat about a single image.

    Loads the model named by ``args``, preprocesses ``args.image_file`` once,
    then loops: read a user line, build the prompt (the image token is
    prepended on the first turn only), stream the generated reply and store it
    in the conversation. An empty line or EOF ends the session.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name,
        args.load_8bit, args.load_4bit, device=args.device)

    # Model-name based auto-detection is disabled; fix conversation mode for now.
    conv_mode = "llava_v1"

    if args.conv_mode is None or conv_mode == args.conv_mode:
        args.conv_mode = conv_mode
    else:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))

    conv = conv_templates[args.conv_mode].copy()
    roles = ('user', 'assistant') if "mpt" in model_name.lower() else conv.roles

    image = load_image(args.image_file)
    image_size = image.size
    # Similar operation in model_worker.py
    image_tensor = process_images([image], image_processor, model.config)
    if type(image_tensor) is list:
        image_tensor = [tensor.to(model.device, dtype=torch.float16) for tensor in image_tensor]
    else:
        image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    while True:
        try:
            inp = input(f"{roles[0]}: ")
        except EOFError:
            inp = ""
        if not inp:
            print("exit...")
            break

        print(f"{roles[1]}: ", end="")

        if image is not None:
            # first message: prepend the image placeholder, optionally wrapped in start/end tokens
            if model.config.mm_use_im_start_end:
                image_prefix = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
            else:
                image_prefix = DEFAULT_IMAGE_TOKEN
            inp = image_prefix + '\n' + inp
            # `image` doubles as the "first turn" flag from here on.
            image = None
        conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.unsqueeze(0).to(model.device)
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        generation_kwargs = dict(
            images=image_tensor,
            image_sizes=[image_size],
            do_sample=True if args.temperature > 0 else False,
            temperature=args.temperature,
            max_new_tokens=args.max_new_tokens,
            streamer=streamer,
            use_cache=True,
        )
        with torch.inference_mode():
            output_ids = model.generate(input_ids, **generation_kwargs)

        # Decode only what follows the prompt tokens and record it as the assistant turn.
        reply_ids = output_ids[0, input_ids.shape[1]:]
        outputs = tokenizer.decode(reply_ids).strip()
        conv.messages[-1][-1] = outputs

        if args.debug:
            print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
129
+
130
+
131
if __name__ == "__main__":
    # Command-line options of the chat script: (flag, add_argument keyword arguments).
    cli_options = (
        ("--model-path", dict(type=str, default="facebook/opt-350m")),
        ("--model-base", dict(type=str, default=None)),
        ("--image-file", dict(type=str, required=True)),
        ("--device", dict(type=str, default="cuda")),
        ("--conv-mode", dict(type=str, default=None)),
        ("--temperature", dict(type=float, default=0.2)),
        ("--max-new-tokens", dict(type=int, default=512)),
        ("--load-8bit", dict(action="store_true")),
        ("--load-4bit", dict(action="store_true")),
        ("--debug", dict(action="store_true")),
    )
    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    main(parser.parse_args())
videollama2/serve/controller.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A controller manages distributed workers.
3
+ It sends worker addresses to clients.
4
+ """
5
+ import argparse
6
+ import asyncio
7
+ import dataclasses
8
+ from enum import Enum, auto
9
+ import json
10
+ import logging
11
+ import time
12
+ from typing import List, Union
13
+ import threading
14
+
15
+ from fastapi import FastAPI, Request
16
+ from fastapi.responses import StreamingResponse
17
+ import numpy as np
18
+ import requests
19
+ import uvicorn
20
+
21
+ from videollama2.constants import CONTROLLER_HEART_BEAT_EXPIRATION
22
+ from videollama2.utils import build_logger, server_error_msg
23
+
24
+
25
+ logger = build_logger("controller", "controller.log")
26
+
27
+
28
class DispatchMethod(Enum):
    """Strategy used by the controller to choose a worker for a request."""

    LOTTERY = auto()
    SHORTEST_QUEUE = auto()

    @classmethod
    def from_str(cls, name):
        """Map a command-line name to its enum member.

        Args:
            name: Either "lottery" or "shortest_queue".

        Returns:
            The matching DispatchMethod member.

        Raises:
            ValueError: If ``name`` is not one of the supported names. The
                message includes the rejected value.
        """
        if name == "lottery":
            return cls.LOTTERY
        if name == "shortest_queue":
            return cls.SHORTEST_QUEUE
        raise ValueError(f"Invalid dispatch method: {name}")
40
+
41
+
42
@dataclasses.dataclass
class WorkerInfo:
    """Controller-side record describing one registered worker."""

    # Names of the models the worker reported it can serve.
    model_names: List[str]
    # Relative weight: lottery dispatch samples proportionally to it and
    # shortest-queue dispatch divides the queue length by it.
    speed: int
    # Last known number of queued requests on the worker.
    queue_length: int
    # Whether the worker is evicted when its heart beat expires.
    check_heart_beat: bool
    # time.time() of the latest registration or heart beat (seconds, float).
    last_heart_beat: float
49
+
50
+
51
def heart_beat_controller(controller):
    """Background loop that periodically evicts expired workers.

    Sleeps for CONTROLLER_HEART_BEAT_EXPIRATION seconds, then asks the
    controller to drop workers whose heart beat is too old. Never returns.
    """
    while True:
        time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION)
        # One sweep per expiration window.
        controller.remove_stable_workers_by_expiration()
55
+
56
+
57
class Controller:
    """Registry of model workers that hands out a worker address per request.

    Workers are keyed by name, and the name doubles as the base URL used to
    reach the worker. A daemon thread periodically evicts workers whose heart
    beat has expired.
    """

    def __init__(self, dispatch_method: str):
        """Create an empty registry and start the expiration thread.

        Args:
            dispatch_method: "lottery" or "shortest_queue".
        """
        # Dict[str -> WorkerInfo]
        self.worker_info = {}
        self.dispatch_method = DispatchMethod.from_str(dispatch_method)

        self.heart_beat_thread = threading.Thread(
            target=heart_beat_controller, args=(self,), daemon=True)
        self.heart_beat_thread.start()

        logger.info("Init controller")

    def register_worker(self, worker_name: str, check_heart_beat: bool,
                        worker_status: dict):
        """Add or overwrite a worker entry.

        Args:
            worker_name: Worker key, also used as its base URL.
            check_heart_beat: Whether the worker is subject to expiration.
            worker_status: Dict with "model_names", "speed" and
                "queue_length"; when falsy it is fetched from the worker.

        Returns:
            True on success, False when no status could be obtained.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Register a new worker: {worker_name}")
        else:
            logger.info(f"Register an existing worker: {worker_name}")

        if not worker_status:
            worker_status = self.get_worker_status(worker_name)
        if not worker_status:
            return False

        self.worker_info[worker_name] = WorkerInfo(
            worker_status["model_names"], worker_status["speed"], worker_status["queue_length"],
            check_heart_beat, time.time())

        logger.info(f"Register done: {worker_name}, {worker_status}")
        return True

    def get_worker_status(self, worker_name: str):
        """POST to the worker's /worker_get_status endpoint.

        Returns:
            The decoded JSON body, or None when the request fails or the
            response status is not 200.
        """
        try:
            r = requests.post(worker_name + "/worker_get_status", timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error(f"Get status fails: {worker_name}, {e}")
            return None

        if r.status_code != 200:
            logger.error(f"Get status fails: {worker_name}, {r}")
            return None

        return r.json()

    def remove_worker(self, worker_name: str):
        """Delete a worker entry; raises KeyError if it is not registered."""
        del self.worker_info[worker_name]

    def refresh_all_workers(self):
        """Re-register every known worker using a freshly fetched status.

        Workers whose status cannot be fetched are dropped.
        """
        old_info = dict(self.worker_info)
        self.worker_info = {}

        for w_name, w_info in old_info.items():
            if not self.register_worker(w_name, w_info.check_heart_beat, None):
                logger.info(f"Remove stale worker: {w_name}")

    def list_models(self):
        """Return the distinct model names served by registered workers."""
        model_names = set()

        for w_info in self.worker_info.values():
            model_names.update(w_info.model_names)

        return list(model_names)

    def get_worker_address(self, model_name: str):
        """Choose a worker that serves ``model_name``.

        Returns:
            The chosen worker name, or "" when no worker is available.

        Raises:
            ValueError: If the configured dispatch method is unknown.
        """
        if self.dispatch_method == DispatchMethod.LOTTERY:
            worker_names = []
            worker_speeds = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_speeds.append(w_info.speed)
            worker_speeds = np.array(worker_speeds, dtype=np.float32)
            norm = np.sum(worker_speeds)
            # No candidate, or every candidate has (near) zero weight.
            if norm < 1e-4:
                return ""
            worker_speeds = worker_speeds / norm
            # Sample one candidate with probability proportional to speed and
            # return it directly, without probing the worker first.
            pt = np.random.choice(np.arange(len(worker_names)),
                                  p=worker_speeds)
            return worker_names[pt]
        elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE:
            worker_names = []
            worker_qlen = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    # Queue length normalized by worker speed.
                    worker_qlen.append(w_info.queue_length / w_info.speed)
            if len(worker_names) == 0:
                return ""
            min_index = np.argmin(worker_qlen)
            w_name = worker_names[min_index]
            # Count the request being dispatched until the next heart beat
            # overwrites the value.
            self.worker_info[w_name].queue_length += 1
            logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}")
            return w_name
        else:
            raise ValueError(f"Invalid dispatch method: {self.dispatch_method}")

    def receive_heart_beat(self, worker_name: str, queue_length: int):
        """Record a heart beat.

        Returns:
            False for a worker that is not registered, True otherwise.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Receive unknown heart beat. {worker_name}")
            return False

        self.worker_info[worker_name].queue_length = queue_length
        self.worker_info[worker_name].last_heart_beat = time.time()
        logger.info(f"Receive heart beat. {worker_name}")
        return True

    def remove_stable_workers_by_expiration(self):
        """Evict heart-beat-checked workers whose last beat is too old."""
        expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION
        # Snapshot the items: this runs on the heart-beat thread while other
        # code may add or replace entries in the dict.
        to_delete = [
            worker_name
            for worker_name, w_info in list(self.worker_info.items())
            if w_info.check_heart_beat and w_info.last_heart_beat < expire
        ]

        for worker_name in to_delete:
            self.remove_worker(worker_name)

    def worker_api_generate_stream(self, params):
        """Relay a generation request to a worker and stream its reply.

        Args:
            params: Request payload; ``params["model"]`` selects the worker.

        Yields:
            NUL-terminated byte chunks. When no worker is available, or the
            request to the worker fails, a single JSON error chunk is
            yielded instead.
        """
        worker_addr = self.get_worker_address(params["model"])
        if not worker_addr:
            logger.info(f"no worker: {params['model']}")
            ret = {
                "text": server_error_msg,
                "error_code": 2,
            }
            yield json.dumps(ret).encode() + b"\0"
            # Stop here: there is no address to forward the request to, and
            # continuing would emit a second, misleading error chunk.
            return

        try:
            response = requests.post(worker_addr + "/worker_generate_stream",
                                     json=params, stream=True, timeout=5)
            for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
                if chunk:
                    yield chunk + b"\0"
        except requests.exceptions.RequestException:
            logger.info(f"worker timeout: {worker_addr}")
            ret = {
                "text": server_error_msg,
                "error_code": 3,
            }
            yield json.dumps(ret).encode() + b"\0"

    # Let the controller act as a worker to achieve hierarchical
    # management. This can be used to connect isolated sub networks.
    def worker_api_get_status(self):
        """Aggregate the live status of all registered workers.

        Returns:
            Dict with the union of "model_names" and the summed "speed" and
            "queue_length" of the workers that answered.
        """
        model_names = set()
        speed = 0
        queue_length = 0

        # Iterate over a snapshot of the keys: each status query is a network
        # call, during which the heart-beat thread may remove entries.
        for w_name in list(self.worker_info):
            worker_status = self.get_worker_status(w_name)
            if worker_status is not None:
                model_names.update(worker_status["model_names"])
                speed += worker_status["speed"]
                queue_length += worker_status["queue_length"]

        return {
            "model_names": list(model_names),
            "speed": speed,
            "queue_length": queue_length,
        }
237
+
238
+
239
+ app = FastAPI()
240
+
241
+
242
@app.post("/register_worker")
async def register_worker(request: Request):
    """HTTP endpoint: register the worker described in the JSON body."""
    payload = await request.json()
    worker_name = payload["worker_name"]
    check_heart_beat = payload["check_heart_beat"]
    # The status is optional; the controller fetches it when absent.
    worker_status = payload.get("worker_status", None)
    controller.register_worker(worker_name, check_heart_beat, worker_status)
248
+
249
+
250
@app.post("/refresh_all_workers")
async def refresh_all_workers():
    """HTTP endpoint: re-query every registered worker and drop dead ones."""
    # Controller.refresh_all_workers() returns nothing, so there is no value
    # to keep or send back.
    controller.refresh_all_workers()
253
+
254
+
255
@app.post("/list_models")
async def list_models():
    """HTTP endpoint: return the model names served by registered workers."""
    return {"models": controller.list_models()}
259
+
260
+
261
@app.post("/get_worker_address")
async def get_worker_address(request: Request):
    """HTTP endpoint: pick a worker for the model named in the JSON body."""
    payload = await request.json()
    model_name = payload["model"]
    return {"address": controller.get_worker_address(model_name)}
266
+
267
+
268
@app.post("/receive_heart_beat")
async def receive_heart_beat(request: Request):
    """HTTP endpoint: record a worker heart beat and report if it is known."""
    payload = await request.json()
    worker_name = payload["worker_name"]
    queue_length = payload["queue_length"]
    known = controller.receive_heart_beat(worker_name, queue_length)
    return {"exist": known}
274
+
275
+
276
@app.post("/worker_generate_stream")
async def worker_api_generate_stream(request: Request):
    """HTTP endpoint: stream a generation relayed through the controller."""
    payload = await request.json()
    return StreamingResponse(controller.worker_api_generate_stream(payload))
281
+
282
+
283
@app.post("/worker_get_status")
async def worker_api_get_status(request: Request):
    """HTTP endpoint: report the aggregated status of all workers."""
    status = controller.worker_api_get_status()
    return status
286
+
287
+
288
if __name__ == "__main__":
    # Script entry point: parse options, build the controller, serve the app.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--host", type=str, default="localhost")
    arg_parser.add_argument("--port", type=int, default=21001)
    arg_parser.add_argument(
        "--dispatch-method",
        type=str,
        choices=["lottery", "shortest_queue"],
        default="shortest_queue",
    )
    args = arg_parser.parse_args()
    logger.info(f"args: {args}")

    # The endpoint functions above read this module-level name.
    controller = Controller(args.dispatch_method)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/examples/1034346401.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08b62a634fe49edc0a19fc53f6ea5cfb345d9b2a6a7047811344c16832dc42b2
3
+ size 1678095
videollama2/serve/examples/desert.jpg ADDED
videollama2/serve/examples/extreme_ironing.jpg ADDED
videollama2/serve/examples/sample_demo_1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6562a172eb9cb3c760a3c9992349c1faa2c793c112b7b9e50bd5cb17c2164d
3
+ size 1549315
videollama2/serve/examples/sample_demo_3.mp4 ADDED
Binary file (464 kB). View file
 
videollama2/serve/examples/sample_demo_9.mp4 ADDED
Binary file (632 kB). View file
 
videollama2/serve/examples/waterview.jpg ADDED
videollama2/serve/gradio_web_server.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import hashlib
5
+ import requests
6
+ import argparse
7
+ import datetime
8
+
9
+ import numpy as np
10
+ import gradio as gr
11
+ from decord import VideoReader, cpu
12
+
13
+ from videollama2.constants import LOGDIR, NUM_FRAMES
14
+ from videollama2.conversation import (default_conversation, conv_templates,SeparatorStyle)
15
+ from videollama2.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
16
+
17
+
18
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
19
+
20
+ headers = {"User-Agent": "Videollama2 Client"}
21
+
22
+ no_change_btn = gr.Button.update()
23
+ enable_btn = gr.Button.update(interactive=True)
24
+ disable_btn = gr.Button.update(interactive=False)
25
+
26
+ priority = {
27
+ "vicuna-13b": "aaaaaaa",
28
+ "koala-13b": "aaaaaab",
29
+ }
30
+
31
+
32
def get_conv_log_filename():
    """Return the path of today's conversation log file under LOGDIR."""
    now = datetime.datetime.now()
    day_stamp = f"{now.year}-{now.month:02d}-{now.day:02d}"
    return os.path.join(LOGDIR, f"{day_stamp}-conv.json")
36
+
37
+
38
def get_model_list():
    """Ask the controller to refresh its workers and return the model names.

    The names are ordered by the module-level ``priority`` table; names not
    in the table sort by themselves.
    """
    base_url = args.controller_url

    refresh_resp = requests.post(base_url + "/refresh_all_workers")
    assert refresh_resp.status_code == 200

    list_resp = requests.post(base_url + "/list_models")
    models = sorted(list_resp.json()["models"], key=lambda x: priority.get(x, x))
    logger.info(f"Models: {models}")
    return models
46
+
47
+
48
+ get_window_url_params = """
49
+ function() {
50
+ const params = new URLSearchParams(window.location.search);
51
+ url_params = Object.fromEntries(params);
52
+ console.log(url_params);
53
+ return url_params;
54
+ }
55
+ """
56
+
57
+
58
def load_demo(url_params, request: gr.Request):
    """Initialise a session: fresh conversation plus the model dropdown.

    When the page URL carries a ``model`` parameter naming a known model,
    that model is preselected in the dropdown.
    """
    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")

    if "model" in url_params and url_params["model"] in models:
        dropdown_update = gr.Dropdown.update(
            value=url_params["model"], visible=True)
    else:
        dropdown_update = gr.Dropdown.update(visible=True)

    return default_conversation.copy(), dropdown_update
70
+
71
+
72
def load_demo_refresh_model_list(request: gr.Request):
    """Initialise a session after re-fetching the model list."""
    logger.info(f"load_demo. ip: {request.client.host}")
    models = get_model_list()
    state = default_conversation.copy()

    # Preselect the first model when there is one.
    if len(models) > 0:
        selected = models[0]
    else:
        selected = ""
    dropdown_update = gr.Dropdown.update(choices=models, value=selected)
    return state, dropdown_update
81
+
82
+
83
def vote_last_response(state, vote_type, model_selector, request: gr.Request):
    """Append one vote record (a JSON line) to today's conversation log."""
    with open(get_conv_log_filename(), "a") as log_file:
        record = {
            "tstamp": round(time.time(), 4),
            "type": vote_type,
            "model": model_selector,
            "state": state.dict(),
            "ip": request.client.host,
        }
        line = json.dumps(record) + "\n"
        log_file.write(line)
93
+
94
+
95
def upvote_last_response(state, model_selector, request: gr.Request):
    """Log an "upvote" for the last response and disable the vote buttons."""
    client_ip = request.client.host
    logger.info(f"upvote. ip: {client_ip}")
    vote_last_response(state, "upvote", model_selector, request)
    # (textbox, upvote, downvote, flag)
    return ("", disable_btn, disable_btn, disable_btn)
99
+
100
+
101
def downvote_last_response(state, model_selector, request: gr.Request):
    """Log a "downvote" for the last response and disable the vote buttons."""
    client_ip = request.client.host
    logger.info(f"downvote. ip: {client_ip}")
    vote_last_response(state, "downvote", model_selector, request)
    # (textbox, upvote, downvote, flag)
    return ("", disable_btn, disable_btn, disable_btn)
105
+
106
+
107
def flag_last_response(state, model_selector, request: gr.Request):
    """Log a "flag" for the last response and disable the vote buttons."""
    client_ip = request.client.host
    logger.info(f"flag. ip: {client_ip}")
    vote_last_response(state, "flag", model_selector, request)
    # (textbox, upvote, downvote, flag)
    return ("", disable_btn, disable_btn, disable_btn)
111
+
112
+
113
def regenerate(state, image_process_mode, request: gr.Request):
    """Clear the last answer so the follow-up bot call generates it again.

    If the preceding user message is a tuple or list, its third element is
    replaced by the currently selected ``image_process_mode``.

    Returns:
        (state, chatbot, textbox, imagebox, videobox) followed by five
        disabled-button updates.
    """
    logger.info(f"regenerate. ip: {request.client.host}")
    state.messages[-1][-1] = None
    prev_human_msg = state.messages[-2]
    # isinstance also accepts tuple/list subclasses, unlike an exact type test.
    if isinstance(prev_human_msg[1], (tuple, list)):
        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
122
+
123
+
124
def clear_history(request: gr.Request):
    """Reset the session to a fresh conversation and empty all inputs."""
    logger.info(f"clear_history. ip: {request.client.host}")
    fresh_state = default_conversation.copy()
    # (state, chatbot, textbox, imagebox, videobox)
    widgets = (fresh_state, fresh_state.to_gradio_chatbot(), "", None, None)
    # (upvote, downvote, flag, generate, clear)
    buttons = (disable_btn,) * 5
    return widgets + buttons
129
+
130
+
131
def add_text_ori(state, text, image, video, image_process_mode, request: gr.Request):
    """Older variant of ``add_text``: append the user's turn to the conversation.

    Every return path yields ten values, in this order:
    (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag,
    generate, clear).

    Args:
        state: Current conversation object.
        text: Text typed by the user.
        image: Image from the image box, or None.
        video: Video from the video box, or None.
        image_process_mode: Selected preprocessing mode, stored in the message.
        request: Gradio request, used for logging the client address.
    """
    # note: imagebox itself is PIL object while videobox is filepath
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
    # Nothing to send: no text and no media (video is checked too, matching
    # add_text).
    if len(text) <= 0 and image is None and video is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
                no_change_btn,) * 5
    assert image is None or video is None, "Please don't feed image and video inputs at the same time!!!"
    text = text[:1536]  # Hard cut-off
    if image is not None:
        # here image is the PIL object itself
        text = text[:1200]  # Hard cut-off for images
        if '<image>' not in text:
            # text = '<Image><image></Image>' + text
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        # Start a new conversation if the current one already holds an image.
        if len(state.get_images(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "image"
    if video is not None:
        print("Video box:", video)
        # here video is the file path of video
        text = text[:1200]  # Hard cut-off for videos
        if '<video>' not in text:
            # text = '<Image><image></Image>' + text
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        # Start a new conversation if the current one already holds a video.
        if len(state.get_videos(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "video"
        print("Set modality as video...")
    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
172
+
173
+
174
def add_text(state, text, image, video, image_process_mode, request: gr.Request):
    """Append the user's turn (text plus optional image/video) to the conversation.

    A new image or video starts a fresh conversation. Every return path
    yields ten values, matching the outputs this handler is wired to:
    (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag,
    generate, clear).

    Args:
        state: Current conversation object.
        text: Text typed by the user.
        image: Image from the image box, or None.
        video: Video from the video box, or None.
        image_process_mode: Selected preprocessing mode, stored in the message.
        request: Gradio request, used for logging the client address.
    """
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")

    # A new video or image resets the conversation.
    if image is not None or video is not None:
        state = default_conversation.copy()

    if len(text) <= 0 and image is None and video is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5

    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            # Five leading values are required here as well (videobox included).
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (no_change_btn,) * 5

    if video is not None:
        # Process the input video.
        text = text[:1200]  # Hard cut-off when media is attached
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        state.modality = "video"
    elif image is not None:
        # Process the input image.
        text = text[:1200]  # Hard cut-off when media is attached
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        state.modality = "image"
    else:
        # Text-only follow-up. The text is known to be non-empty here, and the
        # cut-off applies on every such turn, not only the first follow-up.
        text = text[:1536]  # Hard cut-off
        if state.modality == "image":
            state.modality = "image_text"
        elif state.modality == "video":
            state.modality = "video_text"

    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False

    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
217
+
218
+
219
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
    """Generate the assistant reply by streaming from a model worker.

    This is a generator: each yielded tuple is (state, chatbot) followed by
    five button updates (upvote, downvote, flag, regenerate, clear).

    Steps: on the first round, rebuild the conversation from a template; ask
    the controller for a worker address; save the input frames to disk;
    stream the worker's output into the last message; append a summary
    record to the conversation log.
    """
    logger.info(f"http_bot. ip: {request.client.host}")
    start_tstamp = time.time()
    model_name = model_selector

    if state.skip_next:
        # This generate call is skipped due to invalid inputs
        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation
        if "llava" in model_name.lower():
            if 'llama-2' in model_name.lower():
                template_name = "llava_llama_2"
            elif "v1" in model_name.lower():
                if 'mmtag' in model_name.lower():
                    template_name = "v1_mmtag"
                elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
                    template_name = "v1_mmtag"
                else:
                    template_name = "llava_v1"
            elif "mpt" in model_name.lower():
                template_name = "mpt"
            else:
                if 'mmtag' in model_name.lower():
                    template_name = "v0_mmtag"
                elif 'plain' in model_name.lower() and 'finetune' not in model_name.lower():
                    template_name = "v0_mmtag"
                else:
                    template_name = "llava_v0"
        elif "mpt" in model_name:
            template_name = "mpt_text"
        elif "llama-2" in model_name:
            template_name = "llama_2"
        else:
            template_name = "vicuna_v1"
        # NOTE(review): this unconditional assignment overrides whatever the
        # name-based selection above chose; confirm that is intended.
        template_name = "llava_v1"
        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        new_state.modality = state.modality
        state = new_state

    # Query worker address
    controller_url = args.controller_url
    ret = requests.post(controller_url + "/get_worker_address",
                        json={"model": model_name})
    worker_addr = ret.json()["address"]
    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")

    # No available worker
    if worker_addr == "":
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Construct prompt
    prompt = state.get_prompt()
    is_image = state.modality == "image" or state.modality == "image_text"
    is_video = state.modality == "video" or state.modality == "video_text"
    if is_image:
        all_images = state.get_images(return_pil=True)  # return PIL.Image object
    elif is_video:
        all_images = state.get_videos(return_pil=True)  # return video frames where each frame is a PIL.Image object
    else:
        # Neither modality: nothing to hash or save (avoids an unbound name).
        all_images = []
    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
    # Save every input frame once, under a per-day directory.
    for idx, (image, image_hash) in enumerate(zip(all_images, all_image_hash)):
        t = datetime.datetime.now()
        if is_image:
            filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{image_hash}.jpg")
        else:
            filename = os.path.join(LOGDIR, "serve_videos", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{image_hash}_{idx}.jpg")
        if not os.path.isfile(filename):
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            image.save(filename)

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": min(int(max_new_tokens), 1536),
        "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
        # Short summary for the log line below; replaced by the real payload
        # right after logging.
        "images": f'List of {len(all_image_hash)} images: {all_image_hash}',
    }
    logger.info(f"==== request ====\n{pload}")

    if is_image:
        pload['images'] = state.get_images()
    elif is_video:
        pload['images'] = state.get_videos()
    else:
        # Assumes the worker accepts an empty list for a text-only request;
        # TODO confirm against the worker implementation.
        pload['images'] = []

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5

    # Stays empty if the worker stream produces no chunk at all.
    output = ""
    try:
        # Stream output
        response = requests.post(worker_addr + "/worker_generate_stream",
                                 headers=headers, json=pload, stream=True, timeout=10)
        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode())
                if data["error_code"] == 0:
                    # Drop the leading prompt-length prefix of the returned text.
                    output = data["text"][len(prompt):].strip()
                    state.messages[-1][-1] = output + "▌"
                    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
                else:
                    output = data["text"] + f" (error_code: {data['error_code']})"
                    state.messages[-1][-1] = output
                    yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
                    return
                time.sleep(0.03)
    except requests.exceptions.RequestException:
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Remove the trailing cursor character.
    state.messages[-1][-1] = state.messages[-1][-1][:-1]
    yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5

    finish_tstamp = time.time()
    logger.info(f"{output}")

    with open(get_conv_log_filename(), "a") as fout:
        data = {
            "tstamp": round(finish_tstamp, 4),
            "type": "chat",
            "model": model_name,
            "start": round(start_tstamp, 4),
            "finish": round(finish_tstamp, 4),
            "images": all_image_hash,
            "ip": request.client.host,
        }
        fout.write(json.dumps(data) + "\n")
354
+
355
# Page title shown at the top of the demo (Markdown).
title_markdown = ("""
# The public release of VideoLLaMA2
""")
358
+
359
+ tos_markdown = ("""
360
+ ### Terms of use
361
+ By using this service, users are required to agree to the following terms:
362
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
363
+ Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
364
+ For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
365
+ """)
366
+
367
+
368
+ learn_more_markdown = ("""
369
+ ### License
370
+ The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
371
+ """)
372
+
373
+ block_css = """
374
+
375
+ #buttons button {
376
+ min-width: min(120px,100%);
377
+ }
378
+
379
+ """
380
+
381
def build_demo(embed_mode):
    """Build the Gradio Blocks UI and wire up its event handlers.

    Args:
        embed_mode: When true, the title and the terms/licence sections are
            not rendered.

    Returns:
        The assembled ``gr.Blocks`` application.
    """
    # Created up front so the example galleries can target it; it is placed
    # in the layout later through render().
    textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
    with gr.Blocks(title="Video-Llama", theme=gr.themes.Default(), css=block_css) as demo:
        state = gr.State()

        if not embed_mode:
            gr.Markdown(title_markdown)

        with gr.Row():
            # Left column: model choice, media inputs, examples, parameters.
            with gr.Column(scale=3):
                with gr.Row(elem_id="model_selector_row"):
                    default_model = models[0] if len(models) > 0 else ""
                    model_selector = gr.Dropdown(
                        choices=models,
                        value=default_model,
                        interactive=True,
                        show_label=False,
                        container=False)

                imagebox = gr.Image(type="pil")
                videobox = gr.Video()
                image_process_mode = gr.Radio(
                    ["Crop", "Resize", "Pad", "Default"],
                    value="Default",
                    label="Preprocess for non-square image", visible=False)

                cur_dir = os.path.dirname(os.path.abspath(__file__))
                image_examples = [
                    [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
                    [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
                    [f"{cur_dir}/examples/desert.jpg", "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?"],
                ]
                gr.Examples(examples=image_examples, inputs=[imagebox, textbox], label="Image examples")

                # video example inputs
                video_examples = [
                    [f"{cur_dir}/examples/sample_demo_1.mp4", "Why is this video funny?"],
                    [f"{cur_dir}/examples/sample_demo_3.mp4", "Can you identify any safety hazards in this video?"],
                    [f"{cur_dir}/examples/1034346401.mp4", "What is this young woman doing?"],
                ]
                gr.Examples(examples=video_examples, inputs=[videobox, textbox], label="Video examples")

                with gr.Accordion("Parameters", open=False) as parameter_row:
                    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
                    max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)

            # Right column: chat window, text entry and action buttons.
            with gr.Column(scale=8):
                chatbot = gr.Chatbot(elem_id="chatbot", label="Videollama2 Chatbot", height=550)
                with gr.Row():
                    with gr.Column(scale=8):
                        textbox.render()
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button(value="Send", variant="primary")
                with gr.Row(elem_id="buttons") as button_row:
                    upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
                    downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
                    flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

        if not embed_mode:
            gr.Markdown(tos_markdown)
            gr.Markdown(learn_more_markdown)
        url_params = gr.JSON(visible=False)

        # Register listeners
        btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
        vote_inputs = [state, model_selector]
        vote_outputs = [textbox, upvote_btn, downvote_btn, flag_btn]
        chat_inputs = [state, textbox, imagebox, videobox, image_process_mode]
        chat_outputs = [state, chatbot, textbox, imagebox, videobox] + btn_list
        bot_inputs = [state, model_selector, temperature, top_p, max_output_tokens]
        bot_outputs = [state, chatbot] + btn_list

        upvote_btn.click(upvote_last_response, vote_inputs, vote_outputs)
        downvote_btn.click(downvote_last_response, vote_inputs, vote_outputs)
        flag_btn.click(flag_last_response, vote_inputs, vote_outputs)

        regenerate_btn.click(
            regenerate, [state, image_process_mode], chat_outputs
        ).then(http_bot, bot_inputs, bot_outputs)
        clear_btn.click(clear_history, None, chat_outputs)

        # Pressing ENTER and clicking "Send" trigger the same two-step chain.
        textbox.submit(add_text, chat_inputs, chat_outputs).then(
            http_bot, bot_inputs, bot_outputs)
        submit_btn.click(add_text, chat_inputs, chat_outputs).then(
            http_bot, bot_inputs, bot_outputs)

        if args.model_list_mode == "once":
            demo.load(load_demo, [url_params], [state, model_selector],
                      _js=get_window_url_params)
        elif args.model_list_mode == "reload":
            demo.load(load_demo_refresh_model_list, None, [state, model_selector])
        else:
            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")

    return demo
476
+
477
+
478
if __name__ == "__main__":
    # Script entry point: parse options, fetch the model list, launch the UI.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--host", type=str, default="0.0.0.0")
    arg_parser.add_argument("--port", type=int)
    arg_parser.add_argument("--controller-url", type=str, default="http://localhost:21001")
    arg_parser.add_argument("--concurrency-count", type=int, default=10)
    arg_parser.add_argument(
        "--model-list-mode",
        type=str,
        default="once",
        choices=["once", "reload"],
    )
    arg_parser.add_argument("--share", action="store_true")
    arg_parser.add_argument("--moderate", action="store_true")
    arg_parser.add_argument("--embed", action="store_true")
    args = arg_parser.parse_args()
    logger.info(f"args: {args}")

    # `args` and `models` are read as module-level names by the functions above.
    models = get_model_list()

    logger.info(args)
    demo = build_demo(args.embed)
    queue_options = dict(concurrency_count=args.concurrency_count, api_open=False)
    launch_options = dict(server_name=args.host, server_port=args.port, share=args.share)
    demo.queue(**queue_options).launch(**launch_options)
videollama2/serve/model_worker.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import os
5
+ import json
6
+ import time
7
+ import uuid
8
+ import asyncio
9
+ import requests
10
+ import argparse
11
+ import threading
12
+ from threading import Thread
13
+ from functools import partial
14
+ from typing import Iterator, List, Optional, Tuple
15
+
16
+ import uvicorn
17
+ from fastapi import FastAPI, Request, BackgroundTasks
18
+ from fastapi.responses import StreamingResponse
19
+
20
+ import torch
21
+ import decord
22
+ import numpy as np
23
+ from PIL import Image
24
+ from decord import VideoReader, cpu
25
+ from transformers import TextIteratorStreamer
26
+
27
+ from videollama2.constants import WORKER_HEART_BEAT_INTERVAL
28
+ from videollama2.utils import (build_logger, server_error_msg, pretty_print_semaphore)
29
+ from videollama2.model.builder import load_pretrained_model
30
+ from videollama2.mm_utils import process_images, process_videos, load_image_from_base64, tokenizer_image_token, KeywordsStoppingCriteria, tokenizer_MMODAL_token
31
+ from videollama2.mm_utils import chunk_list, frame_expansion
32
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, NUM_FRAMES, MMODAL_TOKEN_INDEX
33
+
34
+
35
# Number of bytes in one gibibyte.
GB = 2 ** 30

# Short random identifier of this worker process; it is also embedded in the
# name of the log file.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")

# Incremented once per request received by the /worker_generate_stream route.
global_counter = 0

# asyncio.Semaphore limiting concurrent generations; it is created lazily by
# the /worker_generate_stream route on the first request.
model_semaphore = None
42
+
43
+
44
+ # variable_content = os.getenv('MY_VARIABLE', '')
45
+ # KEYWORDS_LIST = set(variable_content.split('\n'))
46
# Substrings that trigger the moderation messages below. They are loaded once,
# at import time, from an optional text file holding one keyword per line; the
# list stays empty (moderation disabled) when the file does not exist.
KEYWORDS_LIST = []
path = 'assets/keywords.txt'
if os.path.exists(path):
    with open(path, 'r', encoding='utf-8') as file:
        # Blank lines must be skipped: an empty keyword is a substring of every
        # string, so a single blank line would block all inputs and outputs.
        KEYWORDS_LIST = [kw for kw in (line.strip() for line in file) if kw]
# NOTE(review): the safety checks lowercase the text but not the keywords, so
# a keyword containing uppercase characters can never match -- confirm whether
# keywords.txt is expected to be all lowercase.


# Returned by safety_check() when generated output contains a keyword.
KEYWORD_BLOCK_MESSAGE2 = "The output contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
# Returned by input_safety_check() when the incoming prompt contains a keyword.
KEYWORD_BLOCK_MESSAGE1 = "Your input question contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
# Number of newly generated tokens accumulated between two safety checks while
# streaming.
STREAM_CHECK_MULTIPLE = 20
60
+
61
+
62
def heart_beat_worker(controller):
    """Send a heart beat through `controller` at a fixed interval, forever.

    Intended as a thread target: the loop never returns. Each iteration first
    sleeps for WORKER_HEART_BEAT_INTERVAL seconds and then calls
    `controller.send_heart_beat()`.
    """
    interval = WORKER_HEART_BEAT_INTERVAL
    while True:
        time.sleep(interval)
        controller.send_heart_beat()
67
+
68
+
69
def safety_check(text, history=None, ) -> Optional[str]:
    """Check generated text against the keyword list.

    Args:
        text: Text to scan; it is lowercased before the substring test.
        history: Accepted for interface compatibility; not used.

    Returns:
        KEYWORD_BLOCK_MESSAGE2 when any entry of KEYWORDS_LIST occurs in the
        lowercased text, otherwise None.
    """
    for keyword in KEYWORDS_LIST:
        if keyword in text.lower():
            print('############')
            return KEYWORD_BLOCK_MESSAGE2
    return None
76
+
77
+
78
def input_safety_check(text) -> Optional[str]:
    """Check an incoming prompt against the keyword list.

    Args:
        text: Prompt to scan; it is lowercased before the substring test.

    Returns:
        KEYWORD_BLOCK_MESSAGE1 when any entry of KEYWORDS_LIST occurs in the
        lowercased prompt (the prompt is also printed), otherwise None.
    """
    for keyword in KEYWORDS_LIST:
        if keyword in text.lower():
            print('######## Input keyword alarm triggered:', text)
            return KEYWORD_BLOCK_MESSAGE1
    return None
83
+
84
+
85
class ModelWorker:
    """Loads one model and serves streaming generation requests for it.

    On construction the model is loaded with `load_pretrained_model`; unless
    `no_register` is set, the worker registers itself with the controller and
    starts a background thread that sends periodic heart beats.
    """

    def __init__(self, controller_addr, worker_addr,
                 worker_id, no_register,
                 model_path, model_base, model_name,
                 load_8bit, load_4bit, device):
        """Load the model and optionally register with the controller.

        Args:
            controller_addr: Base URL of the controller.
            worker_addr: URL under which this worker is announced.
            worker_id: Identifier of this worker (used for logging).
            no_register: When true, skip registration and heart beats.
            model_path: Path or identifier of the model to load.
            model_base: Forwarded to `load_pretrained_model`.
            model_name: Display name; derived from `model_path` when None.
            load_8bit: Forwarded to `load_pretrained_model`.
            load_4bit: Forwarded to `load_pretrained_model`.
            device: Device the input ids are moved to; also forwarded to
                `load_pretrained_model`.
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id
        # Kept exactly as given (including a trailing slash, if any).
        self.model_path = model_path
        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is None:
            model_paths = model_path.split("/")
            # For ".../<run>/checkpoint-N" use "<run>_checkpoint-N"; a bare
            # "checkpoint-N" has no parent component to prepend.
            if model_paths[-1].startswith('checkpoint-') and len(model_paths) > 1:
                self.model_name = model_paths[-2] + "_" + model_paths[-1]
            else:
                self.model_name = model_paths[-1]
        else:
            self.model_name = model_name

        self.device = device
        logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...")
        self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
            model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device)
        # Multimodal inputs are only processed for models whose name matches.
        lowered_name = self.model_name.lower()
        self.is_multimodal = 'videollama2' in lowered_name or 'vlb' in lowered_name

        if not no_register:
            self.register_to_controller()
            # Daemon thread: the endless heart-beat loop must not keep the
            # process alive once the server has stopped.
            self.heart_beat_thread = threading.Thread(
                target=heart_beat_worker, args=(self,), daemon=True)
            self.heart_beat_thread.start()

    def register_to_controller(self):
        """Announce this worker and its status to the controller.

        Raises:
            RuntimeError: If the controller does not answer with HTTP 200.
        """
        logger.info("Register to controller")

        url = self.controller_addr + "/register_worker"
        data = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(url, json=data)
        # Explicit check instead of `assert`, which is stripped under -O.
        if r.status_code != 200:
            raise RuntimeError(
                f"Registering with controller failed: HTTP {r.status_code}")

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying until it answers.

        Request errors are logged and retried every 5 seconds. If the
        controller reports that it does not know this worker, the worker
        registers again.
        """
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                ret = requests.post(url, json={
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
                time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of running plus waiting generation requests.

        Returns 0 while the module-level semaphore has not been created yet.
        """
        if model_semaphore is None:
            return 0
        # Relies on the semaphore's private counters and on the module-level
        # `args` set by the script entry point.
        waiting = len(model_semaphore._waiters) if model_semaphore._waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + waiting

    def get_status(self):
        """Return the status dictionary reported to the controller."""
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }

    @staticmethod
    def _encode_chunk(text, error_code):
        """Serialize one response chunk as NUL-terminated JSON bytes."""
        return json.dumps({"text": text, "error_code": error_code}).encode() + b"\0"

    def _load_video(self, uri):
        """Sample frames from the video at `uri` and run them through `process_videos`."""
        decord_vr = VideoReader(uri=uri, ctx=cpu(0))
        duration = len(decord_vr)
        if "use_taug" not in self.model_path:
            # 8 frame indices spread evenly over the whole video.
            frame_id_list = np.linspace(0, duration-1, 8, dtype=int)
            video_frames = decord_vr.get_batch(frame_id_list).asnumpy()
            return process_videos(video_frames, self.image_processor, self.model.config)

        print("Temporal augmentation activated!!!")
        # 32 frames, grouped into chunks of 4, each chunk passed to
        # frame_expansion before processing.
        frame_id_list = np.linspace(0, duration-1, 8 * 2 * 2, dtype=int)
        video_data = decord_vr.get_batch(frame_id_list)
        video_frames = [Image.fromarray(f) for f in video_data.asnumpy()]
        chunked_video_frames = chunk_list(video_frames, 2*2)
        expanded_video_frames = [frame_expansion(frame_list, 2) for frame_list in chunked_video_frames]
        return process_videos(expanded_video_frames, self.image_processor, self.model.config)

    @torch.inference_mode()
    def generate_stream(self, params):
        """Generate text for one request and yield it incrementally.

        Args:
            params: Request dictionary. Reads "prompt" (required) and the
                optional keys "images", "temperature", "top_p",
                "max_new_tokens" and "stop".

        Yields:
            NUL-terminated JSON byte strings with the keys "text" (prompt plus
            everything generated so far, or a message) and "error_code".

        Raises:
            ValueError: If the number of visual inputs matches neither the
                number of image tokens nor the number of video tokens in the
                prompt.
        """
        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor

        prompt = params["prompt"]
        ori_prompt = prompt
        raw_inputs = params.get("images", None)
        num_image_tokens = 0
        image_args = {}
        # Text-only requests never reach the modality detection below, which
        # previously left this name unbound. NOTE(review): the image index is
        # assumed to be a safe default for prompts without visual tokens --
        # confirm against tokenizer_MMODAL_token.
        modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]

        if raw_inputs is not None and len(raw_inputs) and self.is_multimodal:
            if len(raw_inputs) != prompt.count(DEFAULT_IMAGE_TOKEN) and len(raw_inputs) != (prompt.count(DEFAULT_VIDEO_TOKEN)):
                raise ValueError("Number of images/videos does not match number of <image>/<video> tokens in prompt")

            # The payload does not say whether it carries images or a video:
            # try to decode it as base64 images first and fall back to
            # opening the first entry as a video. The raw inputs are kept
            # untouched so the fallback always sees the original value.
            try:
                print("Load image...")
                pil_images = [load_image_from_base64(image) for image in raw_inputs]
                images_or_videos = process_images(pil_images, image_processor, model.config)

                modal_list = ["image"]
                replace_token = DEFAULT_IMAGE_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
            except Exception:
                print("Load video instead...")
                images_or_videos = self._load_video(raw_inputs[0])

                modal_list = ["video"]
                replace_token = DEFAULT_VIDEO_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]

            if isinstance(images_or_videos, list):
                images_or_videos = [image.to(self.model.device, dtype=torch.float16) for image in images_or_videos]
            else:
                images_or_videos = images_or_videos.to(self.model.device, dtype=torch.float16)
                if modal_list[0] == "video":
                    print("Video:", images_or_videos.shape)
                    images_or_videos = [images_or_videos]
                else:
                    print("Image:", images_or_videos.shape)

            if getattr(self.model.config, 'mm_use_im_start_end', False):
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            # NOTE(review): this always substitutes the *image* token, also
            # for video inputs -- confirm that is intended.
            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

            num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
            image_args = {"images_or_videos": images_or_videos, "modal_list": modal_list}

        print("image_args:", image_args)
        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        stop_str = params.get("stop", None)
        do_sample = temperature > 0.001

        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to(self.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

        # Never ask for more tokens than still fit into the context window.
        max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

        if max_new_tokens < 1:
            yield self._encode_chunk(
                ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", 0)
            return

        generate_kwargs = dict(
            inputs=input_ids,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
            **image_args
        )
        # Only install the keyword stopping criterion when a non-empty stop
        # string was supplied; None or "" is not a usable keyword.
        if stop_str:
            generate_kwargs["stopping_criteria"] = [
                KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)]

        # Generation runs in a helper thread; this generator consumes the
        # streamer it feeds.
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        generated_text = ori_prompt
        token_count = 0
        for new_text in streamer:
            generated_text += new_text
            token_count += len(tokenizer.encode(new_text))
            # Re-run the keyword check every STREAM_CHECK_MULTIPLE tokens.
            if token_count >= STREAM_CHECK_MULTIPLE:
                safety_message = safety_check(generated_text)
                if safety_message:
                    print('####### Keyword alarm triggered:', generated_text)
                    yield self._encode_chunk(safety_message, 1)
                    return
                token_count = 0

            if stop_str and generated_text.endswith(stop_str):
                generated_text = generated_text[:-len(stop_str)]
            yield self._encode_chunk(generated_text, 0)

    def generate_stream_gate(self, params):
        """Run `generate_stream` behind input moderation and error handling.

        Yields the chunks of `generate_stream`. A blocked prompt yields a
        single chunk with the block message; any exception raised during
        generation is printed and turned into a single chunk carrying
        `server_error_msg`. Both use error code 1.
        """
        try:
            input_text = params.get("prompt", "")
            safety_message = input_safety_check(input_text)
            if safety_message:
                yield self._encode_chunk(safety_message, 1)
                return

            for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            yield self._encode_chunk(server_error_msg, 1)
        except torch.cuda.CudaError as e:
            print("Caught torch.cuda.CudaError:", e)
            yield self._encode_chunk(server_error_msg, 1)
        except Exception as e:
            print("Caught Unknown Error", e)
            yield self._encode_chunk(server_error_msg, 1)
333
+
334
+
335
app = FastAPI()


def release_model_semaphore(fn=None):
    """Release one slot of the module-level semaphore, then run `fn` if given.

    Args:
        fn: Optional zero-argument callable invoked after the release.
    """
    model_semaphore.release()
    if fn is None:
        return
    fn()
342
+
343
+
344
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Handle one streaming generation request.

    Parses the JSON body, waits for a free slot of the concurrency semaphore
    (created on first use), and returns a streaming response backed by
    `worker.generate_stream_gate`. The slot is released, followed by a heart
    beat, in a background task attached to the response.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    try:
        # NOTE(review): send_heart_beat performs blocking HTTP calls (and may
        # sleep on errors) inside this coroutine -- consider moving it off
        # the event loop.
        worker.send_heart_beat()
        generator = worker.generate_stream_gate(params)
        background_tasks = BackgroundTasks()
        background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat))
    except Exception:
        # The background task that normally releases the slot only exists
        # once the response is returned; release here so a failure does not
        # leak the slot permanently.
        model_semaphore.release()
        raise
    return StreamingResponse(generator, background=background_tasks)
358
+
359
+
360
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's current status dictionary; the request body is ignored."""
    status = worker.get_status()
    return status
363
+
364
+
365
if __name__ == "__main__":
    # ModelWorker enables multimodal handling when the model name contains
    # "videollama2" or "vlb"; the --multi-modal flag itself only triggers a
    # reminder. The text previously referred to `llava`, which this worker
    # does not check for.
    multi_modal_notice = (
        "Multimodal mode is automatically detected from the model name, "
        "please make sure `videollama2` or `vlb` is included in the model name."
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--multi-modal", action="store_true", help=multi_modal_notice)
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    parser.add_argument("--load-8bit", action="store_true")
    parser.add_argument("--load-4bit", action="store_true")
    # `args` must stay a module-level name: the worker and the request
    # handler read args.limit_model_concurrency.
    args = parser.parse_args()
    logger.info(f"args: {args}")

    if args.multi_modal:
        logger.warning(multi_modal_notice)

    # `worker` must stay a module-level name: the route handlers use it.
    worker = ModelWorker(args.controller_address,
                         args.worker_address,
                         worker_id,
                         args.no_register,
                         args.model_path,
                         args.model_base,
                         args.model_name,
                         args.load_8bit,
                         args.load_4bit,
                         args.device)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")