fffiloni committed
Commit d59f323 · verified · 1 Parent(s): 1281541

Migrated from GitHub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +7 -0
  2. LICENSE +201 -0
  3. ORIGINAL_README.md +166 -0
  4. assets/images/teaser.jpg +0 -0
  5. assets/videos/apt_exp_1_all.gif +3 -0
  6. assets/videos/apt_exp_2_all.gif +3 -0
  7. assets/videos/baodao_exp_1_all.gif +3 -0
  8. assets/videos/exp_1.gif +3 -0
  9. assets/videos/exp_2.gif +3 -0
  10. assets/videos/gf_exp1.gif +3 -0
  11. assets/videos/gf_exp1.mp4 +3 -0
  12. demo.ipynb +0 -0
  13. demo.py +98 -0
  14. demo/demo.py +98 -0
  15. demo/requirements.txt +10 -0
  16. projects/glamm/datasets/__init__.py +7 -0
  17. projects/glamm/datasets/collate_fns/glamm_collate_fn.py +136 -0
  18. projects/glamm/datasets/gcg_dataset.py +349 -0
  19. projects/glamm/datasets/refcoco_segm_dataset.py +195 -0
  20. projects/glamm/datasets/region_level_dataset.py +297 -0
  21. projects/glamm/datasets/semantic_seg_dataset.py +424 -0
  22. projects/glamm/datasets/utils/ade20k_classes.json +30 -0
  23. projects/glamm/datasets/utils/cocostuff_classes.txt +183 -0
  24. projects/glamm/datasets/utils/utils.py +131 -0
  25. projects/glamm/models/glamm.py +183 -0
  26. projects/glamm/models/region_encoder.py +359 -0
  27. projects/glamm/utils.py +280 -0
  28. projects/llava_sam2/configs/sa2va_4b.py +548 -0
  29. projects/llava_sam2/datasets/ChatUniVi_Dataset.py +389 -0
  30. projects/llava_sam2/datasets/GCG_Dataset.py +375 -0
  31. projects/llava_sam2/datasets/Grand_Dataset.py +241 -0
  32. projects/llava_sam2/datasets/MeVIS_Dataset.py +5 -0
  33. projects/llava_sam2/datasets/Osprey_Dataset.py +463 -0
  34. projects/llava_sam2/datasets/ReSAM2_Dataset.py +489 -0
  35. projects/llava_sam2/datasets/ReVOS_Dataset.py +602 -0
  36. projects/llava_sam2/datasets/RefCOCO_Dataset.py +338 -0
  37. projects/llava_sam2/datasets/RefYoutubeVOS_Dataset.py +47 -0
  38. projects/llava_sam2/datasets/__init__.py +15 -0
  39. projects/llava_sam2/datasets/collect_fns.py +206 -0
  40. projects/llava_sam2/datasets/encode_fn.py +144 -0
  41. projects/llava_sam2/datasets/gcg_process.py +297 -0
  42. projects/llava_sam2/datasets/grand_process.py +110 -0
  43. projects/llava_sam2/datasets/utils.py +58 -0
  44. projects/llava_sam2/datasets/vqa_dataset.py +509 -0
  45. projects/llava_sam2/deepspeed_zero2_sam2.json +24 -0
  46. projects/llava_sam2/gradio/app.py +151 -0
  47. projects/llava_sam2/gradio/app_utils.py +293 -0
  48. projects/llava_sam2/models/__init__.py +3 -0
  49. projects/llava_sam2/models/extension/__init__.py +1 -0
  50. projects/llava_sam2/models/extension/sam2_base.py +281 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/videos/apt_exp_1_all.gif filter=lfs diff=lfs merge=lfs -text
+ assets/videos/apt_exp_2_all.gif filter=lfs diff=lfs merge=lfs -text
+ assets/videos/baodao_exp_1_all.gif filter=lfs diff=lfs merge=lfs -text
+ assets/videos/exp_1.gif filter=lfs diff=lfs merge=lfs -text
+ assets/videos/exp_2.gif filter=lfs diff=lfs merge=lfs -text
+ assets/videos/gf_exp1.gif filter=lfs diff=lfs merge=lfs -text
+ assets/videos/gf_exp1.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,166 @@
+ # Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos
+
+ [\[🏠 Sa2VA\]](https://lxtgh.github.io/project/sa2va) [\[📜 arXiv\]](https://arxiv.org/abs/2501.04001) [\[🤗 HuggingFace\]](https://huggingface.co/collections/ByteDance/sa2va-model-zoo-677e3084d71b5f108d00e093) [\[🎥 Introduction\]]() [\[🧑‍💻 GitHub\]](https://github.com/magic-research/Sa2VA) [\[Online Demo (Sa2VA-4B)\]](https://5512470799b6b35fbc.gradio.live/)
+
+ [**Haobo Yuan**](https://yuanhaobo.me/)<sup>1*</sup> · [**Xiangtai Li**](https://scholar.google.com/citations?user=NmHgX-wAAAAJ)<sup>2*&dagger;</sup> · [**Tao Zhang**](https://zhang-tao-whu.github.io/)<sup>2,3*</sup> · [**Zilong Huang**](http://speedinghzl.github.io/)<sup>2</sup> · [**Shilin Xu**](https://xushilin1.github.io/)<sup>4</sup> · [**Shunping Ji**](https://scholar.google.com/citations?user=FjoRmF4AAAAJ&hl=en)<sup>3</sup> · [**Yunhai Tong**](https://scholar.google.com/citations?user=T4gqdPkAAAAJ&hl=zh-CN)<sup>4</sup> ·
+
+ [**Lu Qi**](https://luqi.info/)<sup>2</sup> · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)<sup>2</sup> · [**Ming-Hsuan Yang**](https://faculty.ucmerced.edu/mhyang/)<sup>1</sup>
+
+ <sup>1</sup>UC Merced&emsp;&emsp;&emsp;&emsp;<sup>2</sup>ByteDance Seed&emsp;&emsp;&emsp;&emsp;<sup>3</sup>WHU&emsp;&emsp;&emsp;&emsp;<sup>4</sup>PKU
+
+ &dagger; project lead&emsp;* the first three authors contributed equally to this work
+
+ ![Teaser](assets/images/teaser.jpg)
+
+ ## Overview
+ This repository contains the code for the paper "Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos".
+
+ Sa2VA is the first unified model for dense grounded understanding of both images and videos. Unlike existing multi-modal large language models, which are often limited to specific modalities and tasks, Sa2VA supports a wide range of image and video tasks, including referring segmentation and conversation, with minimal one-shot instruction tuning. Sa2VA combines SAM-2, a foundation video segmentation model, with LLaVA, an advanced vision-language model, and unifies text, image, and video into a shared LLM token space.
+
+ ## Model Zoo
+ We provide the following models:
+ | Model Name | Base MLLM | Language Part | HF Link |
+ |:----------:|:-----------------------------------------------------------------:|:-----------------------------------------------------------------------------:|:----------------------------------------------------:|
+ | Sa2VA-1B | [InternVL2.0-1B](https://huggingface.co/OpenGVLab/InternVL2-1B) | [Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-1B) |
+ | Sa2VA-4B | [InternVL2.5-4B](https://huggingface.co/OpenGVLab/InternVL2_5-4B) | [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-4B) |
+ | Sa2VA-8B | [InternVL2.5-8B](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | [internlm2_5-7b-chat](https://huggingface.co/internlm/internlm2_5-7b-chat) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-8B) |
+
+ ## Gradio Demos
+
+ We provide a script that implements an interactive chat interface with Gradio (it requires `gradio==4.42.0`). You can use it to quickly build a chat interface locally.
+ ```shell
+ PYTHONPATH=. python projects/llava_sam2/gradio/app.py ByteDance/Sa2VA-4B
+ ```
+
+ ## Quick Start
+
+ Our Sa2VA models are available on 🤗 HuggingFace. With just a few steps, you can try them on your own data. You can install the packages in `demo/requirements.txt` to avoid installing the training-only dependencies.
+
+ **Option 1 - scripts:**
+
+ Suppose you have a folder (`PATH_TO_FOLDER`) that contains the frames of a video. You can use the following script to chat with the Sa2VA model or segment objects in the video.
+
+ ```bash
+ > cd scripts
+ > python demo.py PATH_TO_FOLDER --model_path ByteDance/Sa2VA-8B --work-dir OUTPUT_DIR --text "<image>Please describe the video content."
+ ```
+
+ If the output contains segmentation results, they will be saved to `OUTPUT_DIR`.
+
+ **Option 2 - Jupyter Notebook:**
+
+ Please refer to `demo.ipynb`.
+
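For reference, the inference path exercised by `demo.py` in this commit reduces to the following minimal sketch for a single image. The checkpoint name is taken from the Model Zoo above; the image path and prompt are placeholders.

```python
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a released checkpoint with its custom code, as demo.py does.
model = AutoModelForCausalLM.from_pretrained(
    "ByteDance/Sa2VA-4B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ByteDance/Sa2VA-4B", trust_remote_code=True)

# Single-image query; "frame.jpg" is a placeholder path.
image = Image.open("frame.jpg").convert("RGB")
result = model.predict_forward(
    image=image,
    text="<image>Please segment the person on the left.",
    tokenizer=tokenizer,
)

print(result["prediction"])                 # text answer, may contain [SEG] tokens
if "[SEG]" in result["prediction"]:
    masks = result["prediction_masks"]      # per-object binary masks
```

For a whole video, pass the frame list via `video=` instead of `image=`, as shown in `demo.py` below.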
+ ## Demo
+
+ <details open>
+ <summary>Demo 1</summary>
+ Input Video (Source: La La Land 2016):
+
+ ![Error](assets/videos/exp_1.gif)
+
+ Instruction: "Please segment the girl wearing the yellow dress."
+ </details>
+
+ <details open>
+ <summary>Demo 2</summary>
+ Input Video (Source: La La Land 2016):
+
+ ![Error](assets/videos/exp_2.gif)
+
+ Instruction: "Please segment the main character."
+ </details>
+
+ <details open>
+ <summary>Demo 3</summary>
+ Input Video (Source: Internet):
+
+ ![Error](assets/videos/apt_exp_1_all.gif)
+
+ Instruction: "Please segment the person wearing sunglasses."
+ </details>
+
+ <details open>
+ <summary>Demo 4</summary>
+ Input Video (Source: Internet):
+
+ ![Error](assets/videos/apt_exp_2_all.gif)
+
+ Instruction: "Please segment the singing girl."
+ </details>
+
+ <details open>
+ <summary>Demo 5</summary>
+ Input Video:
+
+ ![Error](assets/videos/gf_exp1.gif)
+
+ Instruction: "What is the atmosphere of the scene?"
+
+ Answer: "The scene has a dark and mysterious atmosphere, with the men dressed in suits and ties, and the dimly lit room."
+ </details>
+
+ ## Training
+ <details open>
+ <summary>Installation</summary>
+
+ 1. Install Python and PyTorch first:
+ ```bash
+ > conda create -n vlm python=3.10
+ > conda activate vlm
+ > conda install pytorch==2.3.1 torchvision==0.18.1 pytorch-cuda=12.1 cuda -c pytorch -c "nvidia/label/cuda-12.1.0" -c "nvidia/label/cuda-12.1.1"
+ ```
+
+ 2. Install mmcv:
+ ```bash
+ > pip install mmcv==2.2.0 -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.3/index.html
+ ```
+
+ 3. Install the other dependencies:
+ ```bash
+ > pip install -r requirements.txt
+ ```
+ </details>
+
+ <details open>
+ <summary>Pretrained Model Preparation</summary>
+
+ You are expected to download the following pretrained models and place them in the `./pretrained` directory:
+ - [sam2_hiera_large.pt](https://huggingface.co/facebook/sam2-hiera-large)
+ - [InternVL2_5-4B](https://huggingface.co/OpenGVLab/InternVL2_5-4B)
+
+ </details>
+
+ <details open>
+ <summary>Data Preparation</summary>
+
+ (TODO) Please download the training datasets and place them in the `data` directory. The download link is [here](https://huggingface.co/datasets/Dense-World/Sa2VA-Training).
+
+ </details>
+
+ <details open>
+ <summary>Training Script</summary>
+
+ Please run the following script to train:
+ ```bash
+ > bash tools/dist.sh train projects/llava_sam2/configs/sa2va_4b.py 8
+ ```
+ </details>
+
+ ## References
+ If you find this repository useful, please consider citing the following paper:
+ ```
+ @article{sa2va,
+   title={Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos},
+   author={Yuan, Haobo and Li, Xiangtai and Zhang, Tao and Huang, Zilong and Xu, Shilin and Ji, Shunping and Tong, Yunhai and Qi, Lu and Feng, Jiashi and Yang, Ming-Hsuan},
+   journal={arXiv},
+   year={2025}
+ }
+ ```
assets/images/teaser.jpg ADDED
assets/videos/apt_exp_1_all.gif ADDED

Git LFS Details

  • SHA256: ddf6e915c5f5f00e11136b4342c63b601fd446f714967333db4995c6ee4b797c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.11 MB
assets/videos/apt_exp_2_all.gif ADDED

Git LFS Details

  • SHA256: eb9a946270dd9d3a1f1f0b30ff55d70abea9cf54bc52499cb07813e80a8f1e33
  • Pointer size: 132 Bytes
  • Size of remote file: 1.22 MB
assets/videos/baodao_exp_1_all.gif ADDED

Git LFS Details

  • SHA256: e762e253dafb71ecf90d48144422bcd6fdcdf9c6a3c67571ee1a9d0232e32f03
  • Pointer size: 132 Bytes
  • Size of remote file: 2.95 MB
assets/videos/exp_1.gif ADDED

Git LFS Details

  • SHA256: 7b63b1465808dbe658761936b61a10f3e72bfc04f0b144a9e9103fcfaa810147
  • Pointer size: 132 Bytes
  • Size of remote file: 4.26 MB
assets/videos/exp_2.gif ADDED

Git LFS Details

  • SHA256: fad52f51a9f4238106923217e1d60c3ebc563c77117c49988496a67699ead397
  • Pointer size: 132 Bytes
  • Size of remote file: 3.84 MB
assets/videos/gf_exp1.gif ADDED

Git LFS Details

  • SHA256: 2cb7962fa6d20f4535b07e526c8a65edfcee55d5c2ec79308f98dde24c209842
  • Pointer size: 132 Bytes
  • Size of remote file: 4.82 MB
assets/videos/gf_exp1.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:272f4246fbb62aa690811e01d5f8aecaac3d157cc01a9859de79675ee5d4f7cf
+ size 15332128
demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
demo.py ADDED
@@ -0,0 +1,98 @@
+ import argparse
+ import os
+
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ import cv2
+ try:
+     from mmengine.visualization import Visualizer
+ except ImportError:
+     Visualizer = None
+     print("Warning: mmengine is not installed, visualization is disabled.")
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Video Reasoning Segmentation')
+     parser.add_argument('image_folder', help='Path to a folder of video frames')
+     parser.add_argument('--model_path', default="ByteDance/Sa2VA-8B")
+     parser.add_argument('--work-dir', default=None, help='The dir to save results.')
+     parser.add_argument('--text', type=str, default="<image>Please describe the video content.")
+     parser.add_argument('--select', type=int, default=-1)
+     args = parser.parse_args()
+     return args
+
+
+ def visualize(pred_mask, image_path, work_dir):
+     visualizer = Visualizer()
+     img = cv2.imread(image_path)
+     visualizer.set_image(img)
+     visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
+     visual_result = visualizer.get_image()
+
+     output_path = os.path.join(work_dir, os.path.basename(image_path))
+     cv2.imwrite(output_path, visual_result)
+
+
+ if __name__ == "__main__":
+     cfg = parse_args()
+     model_path = cfg.model_path
+     model = AutoModelForCausalLM.from_pretrained(
+         model_path,
+         torch_dtype="auto",
+         device_map="auto",
+         trust_remote_code=True
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_path,
+         trust_remote_code=True
+     )
+
+     # Collect the frame paths in sorted order.
+     image_files = []
+     image_paths = []
+     image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}
+     for filename in sorted(list(os.listdir(cfg.image_folder))):
+         if os.path.splitext(filename)[1].lower() in image_extensions:
+             image_files.append(filename)
+             image_paths.append(os.path.join(cfg.image_folder, filename))
+
+     vid_frames = []
+     for img_path in image_paths:
+         img = Image.open(img_path).convert('RGB')
+         vid_frames.append(img)
+
+     if cfg.select > 0:
+         # Single-image mode: query only the selected frame (1-indexed).
+         img_frame = vid_frames[cfg.select - 1]
+
+         print(f"Selected frame {cfg.select}")
+         print(f"The input is:\n{cfg.text}")
+         result = model.predict_forward(
+             image=img_frame,
+             text=cfg.text,
+             tokenizer=tokenizer,
+         )
+     else:
+         # Video mode: pass all frames at once.
+         print(f"The input is:\n{cfg.text}")
+         result = model.predict_forward(
+             video=vid_frames,
+             text=cfg.text,
+             tokenizer=tokenizer,
+         )
+
+     prediction = result['prediction']
+     print(f"The output is:\n{prediction}")
+
+     # If the answer contains [SEG] tokens, overlay the predicted masks on each frame.
+     if '[SEG]' in prediction and Visualizer is not None:
+         _seg_idx = 0
+         pred_masks = result['prediction_masks'][_seg_idx]
+         for frame_idx in range(len(vid_frames)):
+             pred_mask = pred_masks[frame_idx]
+             if cfg.work_dir:
+                 os.makedirs(cfg.work_dir, exist_ok=True)
+                 visualize(pred_mask, image_paths[frame_idx], cfg.work_dir)
+             else:
+                 os.makedirs('./temp_visualize_results', exist_ok=True)
+                 visualize(pred_mask, image_paths[frame_idx], './temp_visualize_results')
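Note that `demo.py` skips visualization entirely when mmengine is unavailable. A dependency-light fallback (a sketch, not part of this commit) could overlay a predicted binary mask using only OpenCV, which is already pinned in `demo/requirements.txt`:

```python
import cv2
import numpy as np

def overlay_mask(image_path: str, pred_mask: np.ndarray, out_path: str, alpha: float = 0.4) -> None:
    """Blend a green overlay onto the masked region and write the result to disk."""
    img = cv2.imread(image_path)
    mask = pred_mask.astype(bool)
    overlay = img.copy()
    overlay[mask] = (0, 255, 0)  # BGR green for the segmented region
    blended = cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0)
    cv2.imwrite(out_path, blended)
```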
demo/demo.py ADDED
@@ -0,0 +1,98 @@
1
+ import argparse
2
+ import os
3
+
4
+ from PIL import Image
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+
7
+ import cv2
8
+ try:
9
+ from mmengine.visualization import Visualizer
10
+ except ImportError:
11
+ Visualizer = None
12
+ print("Warning: mmengine is not installed, visualization is disabled.")
13
+
14
+
15
+ def parse_args():
16
+ parser = argparse.ArgumentParser(description='Video Reasoning Segmentation')
17
+ parser.add_argument('image_folder', help='Path to image file')
18
+ parser.add_argument('--model_path', default="ByteDance/Sa2VA-8B")
19
+ parser.add_argument('--work-dir', default=None, help='The dir to save results.')
20
+ parser.add_argument('--text', type=str, default="<image>Please describe the video content.")
21
+ parser.add_argument('--select', type=int, default=-1)
22
+ args = parser.parse_args()
23
+ return args
24
+
25
+
26
+ def visualize(pred_mask, image_path, work_dir):
27
+ visualizer = Visualizer()
28
+ img = cv2.imread(image_path)
29
+ visualizer.set_image(img)
30
+ visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
31
+ visual_result = visualizer.get_image()
32
+
33
+ output_path = os.path.join(work_dir, os.path.basename(image_path))
34
+ cv2.imwrite(output_path, visual_result)
35
+
36
+ if __name__ == "__main__":
37
+ cfg = parse_args()
38
+ model_path = cfg.model_path
39
+ model = AutoModelForCausalLM.from_pretrained(
40
+ model_path,
41
+ torch_dtype="auto",
42
+ device_map="auto",
43
+ trust_remote_code=True
44
+ )
45
+
46
+ tokenizer = AutoTokenizer.from_pretrained(
47
+ model_path,
48
+ trust_remote_code=True
49
+ )
50
+
51
+ image_files = []
52
+ image_paths = []
53
+ image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}
54
+ for filename in sorted(list(os.listdir(cfg.image_folder))):
55
+ if os.path.splitext(filename)[1].lower() in image_extensions:
56
+ image_files.append(filename)
57
+ image_paths.append(os.path.join(cfg.image_folder, filename))
58
+
59
+ vid_frames = []
60
+ for img_path in image_paths:
61
+ img = Image.open(img_path).convert('RGB')
62
+ vid_frames.append(img)
63
+
64
+
65
+ if cfg.select > 0:
66
+ img_frame = vid_frames[cfg.select - 1]
67
+
68
+ print(f"Selected frame {cfg.select}")
69
+ print(f"The input is:\n{cfg.text}")
70
+ result = model.predict_forward(
71
+ image=img_frame,
72
+ text=cfg.text,
73
+ tokenizer=tokenizer,
74
+ )
75
+ else:
76
+ print(f"The input is:\n{cfg.text}")
77
+ result = model.predict_forward(
78
+ video=vid_frames,
79
+ text=cfg.text,
80
+ tokenizer=tokenizer,
81
+ )
82
+
83
+ prediction = result['prediction']
84
+ print(f"The output is:\n{prediction}")
85
+
86
+ if '[SEG]' in prediction and Visualizer is not None:
87
+ _seg_idx = 0
88
+ pred_masks = result['prediction_masks'][_seg_idx]
89
+ for frame_idx in range(len(vid_frames)):
90
+ pred_mask = pred_masks[frame_idx]
91
+ if cfg.work_dir:
92
+ os.makedirs(cfg.work_dir, exist_ok=True)
93
+ visualize(pred_mask, image_paths[frame_idx], cfg.work_dir)
94
+ else:
95
+ os.makedirs('./temp_visualize_results', exist_ok=True)
96
+ visualize(pred_mask, image_paths[frame_idx], './temp_visualize_results')
97
+ else:
98
+ pass
demo/requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch==2.3.1
+ torchvision==0.18.1
+ transformers==4.42.3
+ opencv-python-headless<4.10
+ peft<0.14.0
+ timm==1.0.9
+ einops==0.8.0
+ flash_attn
+ sentencepiece==0.2.0
+ mmengine<1
projects/glamm/datasets/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .semantic_seg_dataset import SemanticSegDataset, ADE20kSemanticSegDataset, \
+     COCOStuffSemanticSegDataset, PascalPartSemanticSegDataset, PacoSemanticSegDataset
+ from .gcg_dataset import GCGDataset, GranDfGCGDataset, RefCOCOgGCGDataset, OpenPsgGCGDataset, Flickr30kGCGDataset
+ from .region_level_dataset import RefCocoGRegionDataset, VisualGenomeRegionDataset
+ from .refcoco_segm_dataset import ReferSegmDataset
+ from .utils.utils import *
+ from .collate_fns.glamm_collate_fn import glamm_collate_fn
projects/glamm/datasets/collate_fns/glamm_collate_fn.py ADDED
@@ -0,0 +1,136 @@
1
+ from typing import Dict, Sequence
2
+
3
+ import torch
4
+ from torch.nn.utils.rnn import pad_sequence
5
+
6
+ from xtuner.parallel.sequence import (get_sequence_parallel_world_size,
7
+ pad_for_sequence_parallel)
8
+ from xtuner.utils import DEFAULT_PAD_TOKEN_INDEX, IGNORE_INDEX
9
+
10
+
11
+ def glamm_collate_fn(instances: Sequence[Dict],
12
+ pad_index: int = DEFAULT_PAD_TOKEN_INDEX,
13
+ return_hf_format: bool = False,
14
+ use_varlen_attn: bool = False):
15
+ seq_parallel_world_size = get_sequence_parallel_world_size()
16
+
17
+ input_ids, labels = [], []
18
+ has_image = any(inst.get('pixel_values') is not None for inst in instances)
19
+ has_grounding_image = any(inst.get('g_pixel_values') is not None for inst in instances)
20
+ has_mask = any(inst.get('masks') is not None for inst in instances)
21
+ has_bboxes = any(inst.get('bboxes') is not None for inst in instances)
22
+ has_points = any(inst.get('points') is not None for inst in instances)
23
+
24
+ if use_varlen_attn:
25
+ position_ids, cumulative_len = [], []
26
+ assert len(instances) == 1, (
27
+ f'If utilizing varlen attention, the batch size should be'
28
+ f' set to 1, but got {len(instances)}')
29
+ assert not has_image, 'Currently, it is not configured to '
30
+ 'accommodate the use of varlen Attention in multimodal training'
31
+
32
+ if has_image:
33
+ pixel_values = []
34
+ if has_grounding_image:
35
+ grounding_pixel_values = []
36
+ if has_mask:
37
+ object_masks = []
38
+ if has_bboxes:
39
+ object_bboxes = []
40
+ if has_points:
41
+ prompt_points = []
42
+
43
+ for example in instances:
44
+ input_ids.append(torch.LongTensor(example['input_ids']))
45
+ labels.append(torch.LongTensor(example['labels']))
46
+ if use_varlen_attn:
47
+ cumulative_len.append(torch.IntTensor(example['cumulative_len']))
48
+ position_ids.append(torch.LongTensor(example['position_ids']))
49
+
50
+ if has_image:
51
+ pixel_values.append(example['pixel_values'])
52
+ if has_grounding_image:
53
+ grounding_pixel_values.append(example['g_pixel_values'])
54
+ if has_mask:
55
+ if 'masks' in example.keys() and example['masks'] is not None:
56
+ object_masks.append(example['masks'])
57
+ if has_bboxes:
58
+ if 'bboxes' in example.keys() and example['bboxes'] is not None:
59
+ object_bboxes.append(example['bboxes'])
60
+ if has_points:
61
+ if 'points' in example.keys() and example['points'] is not None:
62
+ prompt_points.append(example['points'])
63
+
64
+ ori_length = [len(ids) for ids in input_ids]
65
+ if len(instances) > 1:
66
+ input_ids = pad_sequence(
67
+ input_ids, batch_first=True, padding_value=pad_index)
68
+ labels = pad_sequence(
69
+ labels, batch_first=True, padding_value=IGNORE_INDEX)
70
+ else:
71
+ input_ids = torch.stack(input_ids)
72
+ labels = torch.stack(labels)
73
+
74
+ if use_varlen_attn:
75
+ assert input_ids.size(1) % seq_parallel_world_size == 0
76
+ attention_mask = None
77
+ position_ids = torch.stack(position_ids, dim=0)
78
+ else:
79
+ # Some tokenizers have the same eos token and pad token, so input_ids
80
+ # cannot be masked directly based on the pad token id.
81
+ attention_mask = torch.zeros_like(input_ids).bool()
82
+ for i, length in enumerate(ori_length):
83
+ attention_mask[i, :length] = True
84
+
85
+ bs, seq_len = input_ids.shape
86
+ position_ids = torch.arange(seq_len).unsqueeze(0).long().repeat(bs, 1)
87
+
88
+ if seq_parallel_world_size > 1:
89
+ input_ids = pad_for_sequence_parallel(input_ids, pad_index)
90
+ labels = pad_for_sequence_parallel(labels, IGNORE_INDEX)
91
+ position_ids = pad_for_sequence_parallel(position_ids, 0)
92
+ if attention_mask is not None:
93
+ attention_mask = pad_for_sequence_parallel(attention_mask, 0)
94
+
95
+ if use_varlen_attn:
96
+ max_seqlen = (
97
+ cumulative_len[0][1:] - # noqa: W504
98
+ cumulative_len[0][:-1]).max().item()
99
+ data_dict = {
100
+ 'input_ids': input_ids,
101
+ 'cumulative_len': cumulative_len,
102
+ 'position_ids': position_ids,
103
+ 'labels': labels,
104
+ 'max_seqlen': max_seqlen
105
+ }
106
+ else:
107
+ data_dict = {
108
+ 'input_ids': input_ids,
109
+ 'attention_mask': attention_mask,
110
+ 'position_ids': position_ids,
111
+ 'labels': labels
112
+ }
113
+
114
+ if has_image:
115
+ if all(x.shape == pixel_values[0].shape for x in pixel_values):
116
+ pixel_values = torch.stack(pixel_values, dim=0)
117
+ data_dict['pixel_values'] = pixel_values
118
+
119
+ if has_grounding_image:
120
+ # if all(x.shape == grounding_pixel_values[0].shape for x in grounding_pixel_values):
121
+ # grounding_pixel_values = torch.stack(grounding_pixel_values, dim=0)
122
+ data_dict['g_pixel_values'] = grounding_pixel_values
123
+
124
+ if has_mask:
125
+ data_dict['masks'] = object_masks
126
+
127
+ if has_bboxes:
128
+ data_dict['bboxes'] = object_bboxes
129
+
130
+ if has_points:
131
+ data_dict['points'] = prompt_points
132
+
133
+ if return_hf_format:
134
+ return data_dict
135
+ else:
136
+ return {'data': data_dict, 'data_samples': None}
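Stripped of the image, mask, and sequence-parallel bookkeeping, the text-padding logic in `glamm_collate_fn` above reduces to the following self-contained sketch. The toy token IDs are made up; `-100` stands in for xtuner's `IGNORE_INDEX` and `0` for the pad token id.

```python
import torch
from torch.nn.utils.rnn import pad_sequence

IGNORE_INDEX = -100  # label value ignored by the loss, as in xtuner
PAD_INDEX = 0        # placeholder pad token id

# Two variable-length examples, as produced by the dataset's encode step.
input_ids = [torch.tensor([5, 6, 7, 8]), torch.tensor([9, 10])]
labels    = [torch.tensor([5, 6, 7, 8]), torch.tensor([9, 10])]
ori_length = [len(ids) for ids in input_ids]

padded_ids = pad_sequence(input_ids, batch_first=True, padding_value=PAD_INDEX)
padded_labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)

# The attention mask is built from the original lengths rather than the pad id,
# because some tokenizers reuse the eos token as the pad token.
attention_mask = torch.zeros_like(padded_ids).bool()
for i, length in enumerate(ori_length):
    attention_mask[i, :length] = True

bs, seq_len = padded_ids.shape
position_ids = torch.arange(seq_len).unsqueeze(0).repeat(bs, 1)
print(padded_ids, padded_labels, attention_mask, position_ids, sep="\n")
```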
projects/glamm/datasets/gcg_dataset.py ADDED
@@ -0,0 +1,349 @@
1
+ import copy
2
+ import random
3
+ import glob
4
+ import json
5
+ import logging
6
+ import os
7
+ import torch
8
+
9
+ from mmengine import print_log
10
+ from mmengine.config import Config, ConfigDict
11
+ from PIL import Image
12
+ from torch.utils.data import Dataset
13
+ import numpy as np
14
+ import torch.nn.functional as F
15
+ from pycocotools.coco import COCO
16
+ from pycocotools import mask as mask_utils
17
+
18
+ from xtuner.registry import BUILDER
19
+
20
+ from xtuner.dataset.utils import encode_fn
21
+ from xtuner.dataset.map_fns import llava_map_fn
22
+
23
+ from projects.glamm.datasets.utils.utils import expand2square
24
+
25
+ from projects.glamm.datasets.utils.utils import GCG_QUESTIONS, ANSWER_LIST
26
+ from projects.glamm.utils import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
27
+ class GCGDataset(Dataset):
28
+ def __init__(self,
29
+ image_folder,
30
+ image_processor,
31
+ data_path=None,
32
+ tokenizer=None,
33
+ template_map_fn=None,
34
+ max_length=2048,
35
+ pad_image_to_square=False,
36
+ repeats=1,
37
+ num_classes_per_sample=3,
38
+ extra_image_processor=None):
39
+ super().__init__()
40
+ self.question_templates = GCG_QUESTIONS
41
+ if extra_image_processor is not None:
42
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
43
+ self.num_classes_per_sample = num_classes_per_sample
44
+ self.tokenizer = BUILDER.build(tokenizer)
45
+
46
+ self.tokenizer.add_tokens(
47
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
48
+ )
49
+ reg_tokens = ['<bbox>', '<point>']
50
+ segmentation_tokens = ['[SEG]']
51
+ phrase_tokens = ['<p>', '</p>']
52
+ special_tokens = reg_tokens + segmentation_tokens + phrase_tokens
53
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
54
+
55
+ self.max_length = max_length
56
+ self.template_map_fn = BUILDER.build(template_map_fn)
57
+
58
+ self.text_data = self.json_file_preprocess(data_path, image_folder)
59
+ self.image_folder = image_folder
60
+
61
+ self.image_processor = BUILDER.build(image_processor)
62
+ size = self.image_processor.crop_size
63
+
64
+ if isinstance(size, dict):
65
+ self.image_w, self.image_h = size['width'], size['height']
66
+ elif isinstance(size, int):
67
+ self.image_h, self.image_w = size, size
68
+ else:
69
+ self.image_w, self.image_h = size
70
+
71
+ self.pad_image_to_square = pad_image_to_square
72
+ self.repeats = repeats
73
+
74
+ def json_file_preprocess(self, data_path, image_folder=None):
75
+ with open(data_path, 'r') as f:
76
+ json_data = json.load(f)
77
+ return json_data
78
+
79
+ @property
80
+ def modality_length(self):
81
+ length_list = []
82
+ for data_dict in self.text_data:
83
+ cur_len = 100
84
+ length_list.append(cur_len)
85
+ return length_list * self.repeats
86
+
87
+ def __len__(self):
88
+ return len(self.text_data) * self.repeats
89
+
90
+ def real_len(self):
91
+ return len(self.text_data)
92
+
93
+ def _parse_annotations(self, ann_info):
94
+ image_path = os.path.join(self.image_folder, ann_info['file_name'])
95
+ image = Image.open(image_path).convert('RGB')
96
+ if hasattr(self, 'extra_image_processor'):
97
+ g_image = np.array(image) # for grounding
98
+ g_image = self.extra_image_processor.apply_image(g_image)
99
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
100
+ ann_info['g_pixel_values'] = g_pixel_values
101
+
102
+ width, height = image.size
103
+ if self.pad_image_to_square:
104
+ image = expand2square(
105
+ image, tuple(int(x * 255) for x in self.image_processor.image_mean))
106
+ image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
107
+ ann_info['pixel_values'] = image
108
+
109
+ caption = ann_info['caption'].strip('"').strip()
110
+ masks, phrases, tokens_positive = [], [], []
111
+ for word, grounding in ann_info["groundings"].items():
112
+ phrases.append(word)
113
+ tokens_positive.append(grounding["token_positives"])
114
+
115
+ # Convert segmentation to binary mask
116
+ binary_mask = np.zeros((height, width), dtype=np.uint8)
117
+ for rle in grounding["rle_masks"]:
118
+ m = mask_utils.decode(rle).astype(np.uint8)
119
+ binary_mask += m.squeeze()
120
+ masks.append(binary_mask)
121
+
122
+ def sort_by_start_index(items, order):
123
+ return [items[i] for i in order]
124
+
125
+ phrase_order = sorted(range(len(tokens_positive)), key=lambda x: tokens_positive[x][0])
126
+ masks = sort_by_start_index(masks, phrase_order)
127
+ phrases = sort_by_start_index(phrases, phrase_order)
128
+ tokens_positive = sort_by_start_index(tokens_positive, phrase_order)
129
+
130
+ ann_info.update({
131
+ 'image_path': image_path,
132
+ 'caption': caption,
133
+ 'masks': masks,
134
+ 'phrases': phrases,
135
+ 'tokens_positive': tokens_positive,
136
+ })
137
+ return ann_info
138
+
139
+ def create_conversation(self, caption, tokens_positive):
140
+ question = random.choice(self.question_templates).strip()
141
+
142
+ # Prepare caption with tags
143
+ def tag_caption(caption, tokens):
144
+ for start, end in sorted(tokens, key=lambda x: x[0], reverse=True):
145
+ caption = f"{caption[:start]}<p> {caption[start:end]} </p> [SEG]{caption[end:]}"
146
+ return caption
147
+
148
+ detailed_answer = tag_caption(caption, tokens_positive)
149
+
150
+ question = 'The <image> provides an overview of the picture.\n' + question
151
+ conversation = [{'input': question, 'output': detailed_answer}]
152
+ return conversation
153
+
154
+ def __getitem__(self, index):
155
+ index = index % self.real_len()
156
+ data_dict = {}
157
+ ann_info = copy.deepcopy(self.text_data[index])
158
+ ann_info = self._parse_annotations(ann_info)
159
+
160
+ data_dict['g_pixel_values'] = ann_info.pop('g_pixel_values')
161
+ data_dict['pixel_values'] = ann_info.pop('pixel_values')
162
+ if len(ann_info['masks']) == 0:
163
+ return self.__getitem__(0)
164
+ data_dict['masks'] = torch.from_numpy(np.stack(ann_info['masks'], axis=0))
165
+
166
+ conversation = self.create_conversation(ann_info['caption'], ann_info['tokens_positive'])
167
+ data_dict['conversation'] = conversation
168
+
169
+ result = self.template_map_fn(data_dict)
170
+ data_dict.update(result)
171
+
172
+ result = encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length, with_image_token=True)
173
+ data_dict.update(result)
174
+
175
+ return data_dict
176
+
177
+ class GranDfGCGDataset(GCGDataset):
178
+ pass
179
+ class RefCOCOgGCGDataset(GCGDataset):
180
+ def json_file_preprocess(self, data_path, image_folder=None):
181
+ with open(data_path, 'r') as f:
182
+ json_data = json.load(f)
183
+ return [list(line.values())[0] for line in json_data]
184
+
185
+ def _parse_annotations(self, ann_info):
186
+ image_path = os.path.join(self.image_folder, ann_info['img_file_name'])
187
+ image = Image.open(image_path).convert('RGB')
188
+ if hasattr(self, 'extra_image_processor'):
189
+ g_image = np.array(image) # for grounding
190
+ g_image = self.extra_image_processor.apply_image(g_image)
191
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
192
+ ann_info['g_pixel_values'] = g_pixel_values
193
+
194
+ width, height = image.size
195
+ if self.pad_image_to_square:
196
+ image = expand2square(
197
+ image, tuple(int(x * 255) for x in self.image_processor.image_mean))
198
+ image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
199
+ ann_info['pixel_values'] = image
200
+
201
+ caption = ann_info['caption'].strip('"').strip().lower()
202
+ masks, phrases, tokens_positive = [], [], []
203
+ for detail in ann_info['refs']:
204
+ phrase = detail['sentence']
205
+ if phrase.lower() in caption:
206
+ phrases.append(phrase)
207
+ index = caption.find(phrase)
208
+ end_index = index + len(phrase) if index != -1 else -1
209
+ tokens_positive.append([index, end_index])
210
+
211
+ binary_mask = np.zeros((height, width), dtype=np.uint8)
212
+ for seg in detail["segmentation"]:
213
+ rles = mask_utils.frPyObjects([seg], height, width)
214
+ m = mask_utils.decode(rles)
215
+ m = m.astype(np.uint8)
216
+ binary_mask += m.squeeze()
217
+ masks.append(binary_mask)
218
+
219
+ def sort_by_start_index(items, order):
220
+ return [items[i] for i in order]
221
+
222
+ phrase_order = sorted(range(len(tokens_positive)), key=lambda x: tokens_positive[x][0])
223
+ masks = sort_by_start_index(masks, phrase_order)
224
+ phrases = sort_by_start_index(phrases, phrase_order)
225
+ tokens_positive = sort_by_start_index(tokens_positive, phrase_order)
226
+
227
+ ann_info.update({
228
+ 'image_path': image_path,
229
+ 'caption': caption,
230
+ 'masks': masks,
231
+ 'phrases': phrases,
232
+ 'tokens_positive': tokens_positive,
233
+ })
234
+ return ann_info
235
+
236
+ class OpenPsgGCGDataset(GCGDataset):
237
+ pass
238
+
239
+ class Flickr30kGCGDataset(GCGDataset):
240
+
241
+ def json_file_preprocess(self, data_path, image_folder=None):
242
+ def filter_images(data_infos, min_size):
243
+ return [i for i, info in enumerate(data_infos) if min(info['width'], info['height']) >= min_size]
244
+
245
+ self.coco = COCO(data_path)
246
+ self.image_ids = self.coco.getImgIds()
247
+ data_infos = []
248
+ total_ann_ids = []
249
+ removed_img_count = 0
250
+ for img_id in self.image_ids:
251
+ info = self.coco.loadImgs([img_id])[0]
252
+ if len(info['caption'].split(' ')) < 3:
253
+ removed_img_count += 1
254
+ continue
255
+ info['filename'] = info['file_name'].split('_')[-1]
256
+ info['height'] = int(info['height'])
257
+ info['width'] = int(info['width'])
258
+ data_infos.append(info)
259
+ ann_ids = self.coco.getAnnIds(imgIds=[img_id])
260
+ total_ann_ids.extend(ann_ids)
261
+ assert len(set(total_ann_ids)) == len(total_ann_ids), f"Non-unique annotation IDs in '{data_path}'!"
262
+ print(f'Removed {removed_img_count} images.')
263
+ data_infos = [data_infos[i] for i in filter_images(data_infos, min_size=32)]
264
+
265
+ return data_infos
266
+
267
+ def _parse_annotations(self, img_info):
268
+ ann_ids = self.coco.getAnnIds(imgIds=img_info['id'])
269
+ ann_info = self.coco.loadAnns(ann_ids)
270
+
271
+ annotations = {'phrases': [], 'caption': img_info['caption'], 'masks': [], 'tokens_positive': []}
272
+ image_path = os.path.join(self.image_folder, img_info['file_name'])
273
+ image = Image.open(image_path).convert('RGB')
274
+ if hasattr(self, 'extra_image_processor'):
275
+ g_image = np.array(image) # for grounding
276
+ g_image = self.extra_image_processor.apply_image(g_image)
277
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
278
+ annotations['g_pixel_values'] = g_pixel_values
279
+
280
+ width, height = image.size
281
+ if self.pad_image_to_square:
282
+ image = expand2square(
283
+ image, tuple(int(x * 255) for x in self.image_processor.image_mean))
284
+ image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
285
+ annotations['pixel_values'] = image
286
+
287
+ for ann in ann_info:
288
+ if ann.get('ignore', False):
289
+ continue
290
+ x1, y1, w, h = ann['bbox']
291
+ inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
292
+ inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
293
+ if inter_w * inter_h == 0 or ann['area'] <= 0 or w < 1 or h < 1:
294
+ continue
295
+ bbox = [x1, y1, x1 + w, y1 + h]
296
+ tokens_positive = ann['tokens_positive']
297
+ phrase = [img_info['caption'][span[0]:span[1]] for span in tokens_positive]
298
+ annotations['phrases'].append(phrase[0])
299
+ annotations['tokens_positive'].append(tokens_positive[0])
300
+
301
+ rle = ann['sam_mask']
302
+ mask_decoded = mask_utils.decode(rle).astype(np.uint8)
303
+ annotations['masks'].append(mask_decoded)
304
+
305
+ def sort_by_start_index(items, order):
306
+ return [items[i] for i in order]
307
+
308
+ phrase_order = sorted(range(len(annotations['tokens_positive'])), key=lambda x: annotations['tokens_positive'][x][0])
309
+ annotations['masks'] = sort_by_start_index(annotations['masks'], phrase_order)
310
+ annotations['phrases'] = sort_by_start_index(annotations['phrases'], phrase_order)
311
+ annotations['tokens_positive'] = sort_by_start_index(annotations['tokens_positive'], phrase_order)
312
+
313
+ return annotations
314
+
315
+ if __name__ == '__main__':
316
+ from transformers import CLIPImageProcessor, AutoTokenizer
317
+ from third_parts.segment_anything.utils.transforms import ResizeLongestSide
318
+ pretrained_model = 'MBZUAI/GLaMM-GranD-Pretrained'
319
+ llm_name_or_path = 'lmsys/vicuna-7b-v1.5'
320
+
321
+ tokenizer = dict(
322
+ type=AutoTokenizer.from_pretrained,
323
+ pretrained_model_name_or_path=llm_name_or_path)
324
+ image_processor = dict(
325
+ type=CLIPImageProcessor.from_pretrained,
326
+ pretrained_model_name_or_path='openai/clip-vit-large-patch14-336')
327
+ extra_image_processor = dict(
328
+ type=ResizeLongestSide,
329
+ target_length=1024,
330
+ )
331
+ from xtuner.utils.templates import PROMPT_TEMPLATE
332
+ prompt_template = PROMPT_TEMPLATE.vicuna
333
+ from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory, template_map_fn
334
+ from projects.glamm.datasets.collate_fns.glamm_collate_fn import glamm_collate_fn
335
+ dataset = Flickr30kGCGDataset(
336
+ image_folder='data/flickr30k/flickr30k-images/',
337
+ image_processor=image_processor,
338
+ data_path='./data/GranDf/annotations/train/flickr_mergedGT_GCG_train.json',
339
+ tokenizer=tokenizer,
340
+ template_map_fn=dict(
341
+ type=template_map_fn_factory, template=prompt_template),
342
+ max_length=2048,
343
+ pad_image_to_square=True,
344
+ repeats=1,
345
+ num_classes_per_sample=3,
346
+ extra_image_processor=extra_image_processor)
347
+
348
+ for i in range(1000):
349
+ print(dataset[i])
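The `create_conversation` method in `GCGDataset` above inserts `<p> ... </p> [SEG]` markers from right to left so that earlier character offsets remain valid. A standalone illustration of that tagging step, with a toy caption and spans:

```python
def tag_caption(caption: str, tokens: list) -> str:
    # Insert tags from right to left so earlier character offsets stay valid.
    for start, end in sorted(tokens, key=lambda x: x[0], reverse=True):
        caption = f"{caption[:start]}<p> {caption[start:end]} </p> [SEG]{caption[end:]}"
    return caption

caption = "a man rides a horse"
tokens_positive = [[0, 5], [14, 19]]  # character spans for "a man" and "horse"
print(tag_caption(caption, tokens_positive))
# -> <p> a man </p> [SEG] rides a <p> horse </p> [SEG]
```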
projects/glamm/datasets/refcoco_segm_dataset.py ADDED
@@ -0,0 +1,195 @@
1
+ import copy
2
+ import random
3
+ import glob
4
+ import json
5
+ import logging
6
+ import os
7
+ import torch
8
+
9
+ from mmengine import print_log
10
+ from mmengine.config import Config, ConfigDict
11
+ from PIL import Image
12
+ from torch.utils.data import Dataset
13
+ import numpy as np
14
+ import torch.nn.functional as F
15
+ from pycocotools.coco import COCO
16
+ from pycocotools import mask as mask_utils
17
+
18
+ from xtuner.registry import BUILDER
19
+
20
+ from xtuner.dataset.utils import encode_fn
21
+ from xtuner.dataset.map_fns import llava_map_fn
22
+
23
+ from projects.glamm.datasets.utils.utils import expand2square
24
+
25
+ from projects.glamm.datasets.utils.utils import SEG_QUESTIONS, ANSWER_LIST
26
+ from projects.glamm.utils import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
27
+
28
+ from third_parts.mmdet.datasets.refcoco import RefCocoDataset
29
+
30
+
31
+ class ReferSegmDataset(RefCocoDataset):
32
+ def __init__(self,
33
+ data_root,
34
+ ann_file=None,
35
+ split_file=None,
36
+ image_processor=None,
37
+ extra_image_processor=None,
38
+ data_prefix=dict(img_path='train2014/'),
39
+ tokenizer=None,
40
+ template_map_fn=None,
41
+ max_length=2048,
42
+ pad_image_to_square=False,
43
+ num_classes_per_sample=3):
44
+ super().__init__(
45
+ data_root=data_root,
46
+ data_prefix=data_prefix,
47
+ pipeline=None,
48
+ ann_file=ann_file,
49
+ split_file=split_file,
50
+ )
51
+ self.begin_str = f"""{DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n"""
52
+
53
+ self.question_templates = SEG_QUESTIONS
54
+ if extra_image_processor is not None:
55
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
56
+ self.num_classes_per_sample = num_classes_per_sample
57
+ self.tokenizer = BUILDER.build(tokenizer)
58
+
59
+ self.tokenizer.add_tokens(
60
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
61
+ )
62
+ reg_tokens = ['<bbox>', '<point>']
63
+ segmentation_tokens = ['[SEG]']
64
+ phrase_tokens = ['<p>', '</p>']
65
+ special_tokens = reg_tokens + segmentation_tokens + phrase_tokens
66
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
67
+
68
+ self.max_length = max_length
69
+ self.template_map_fn = BUILDER.build(template_map_fn)
70
+
71
+ self.image_processor = BUILDER.build(image_processor)
72
+ size = self.image_processor.crop_size
73
+ if isinstance(size, dict):
74
+ self.image_w, self.image_h = size['width'], size['height']
75
+ self.pad_image_to_square = pad_image_to_square
76
+
77
+ @property
78
+ def modality_length(self):
79
+ import pickle
80
+ length_list = []
81
+ for idx in range(len(self)):
82
+ length_list.append(100)
83
+ # for idx in range(len(self)):
84
+ # if self.serialize_data:
85
+ # start_addr = 0 if idx == 0 else self.data_address[idx - 1].item()
86
+ # end_addr = self.data_address[idx].item()
87
+ # bytes = memoryview(
88
+ # self.data_bytes[start_addr:end_addr]) # type: ignore
89
+ # data_dict = pickle.loads(bytes)
90
+ # else:
91
+ # data_dict = copy.deepcopy(self.data_list[idx])
92
+ return length_list
93
+
94
+ def _parse_annotations(self, ann_info):
95
+ image_path = ann_info['img_path']
96
+ image = Image.open(image_path).convert('RGB')
97
+ if hasattr(self, 'extra_image_processor'):
98
+ g_image = np.array(image) # for grounding
99
+ g_image = self.extra_image_processor.apply_image(g_image)
100
+ g_pixel_values = torch.from_numpy(
101
+ g_image).permute(2, 0, 1).contiguous()
102
+ ann_info['g_pixel_values'] = g_pixel_values
103
+
104
+ width, height = image.size
105
+ if self.pad_image_to_square:
106
+ image = expand2square(
107
+ image, tuple(int(x * 255) for x in self.image_processor.image_mean))
108
+ image = self.image_processor.preprocess(
109
+ image, return_tensors='pt')['pixel_values'][0]
110
+ ann_info['pixel_values'] = image
111
+
112
+ masks, phrases = [], []
113
+ instances, text = ann_info['instances'], ann_info['text']
114
+ index = np.random.choice(range(len(instances)), min(
115
+ len(instances), self.num_classes_per_sample))
116
+ for idx in index:
117
+ inst = instances[idx]
118
+ phrase = text[idx].lower()
119
+ phrases.append(phrase)
120
+ binary_mask = np.zeros((height, width), dtype=np.uint8)
121
+ for seg in inst["mask"]:
122
+ rles = mask_utils.frPyObjects([seg], height, width)
123
+ m = mask_utils.decode(rles)
124
+ m = m.astype(np.uint8)
125
+ binary_mask += m.squeeze()
126
+ masks.append(binary_mask)
127
+
128
+ ann_info.update({
129
+ 'masks': masks,
130
+ 'phrases': phrases,
131
+ })
132
+ return ann_info
133
+
134
+ def __getitem__(self, idx):
135
+ data_dict = {}
136
+ ann_info = super().__getitem__(idx)
137
+ ann_info = self._parse_annotations(ann_info)
138
+
139
+ data_dict['g_pixel_values'] = ann_info.pop('g_pixel_values')
140
+ data_dict['pixel_values'] = ann_info.pop('pixel_values')
141
+ if len(ann_info['masks']) == 0:
142
+ return self.__getitem__(0)
143
+ data_dict['masks'] = torch.from_numpy(
144
+ np.stack(ann_info['masks'], axis=0))
145
+
146
+ conversation = []
147
+ for i, phrase in enumerate(ann_info['phrases']):
148
+ question = random.choice(SEG_QUESTIONS).format(class_name=phrase)
+ if i == 0:
+ # prepend the image token so encode_fn(with_image_token=True) can insert visual features
+ question = self.begin_str + question
149
+ conversation.append(
150
+ {'input': question, 'output': random.choice(ANSWER_LIST)})
151
+
152
+ data_dict['conversation'] = conversation
153
+ result = self.template_map_fn(data_dict)
154
+ data_dict.update(result)
155
+
156
+ result = encode_fn(data_dict, tokenizer=self.tokenizer,
157
+ max_length=self.max_length, with_image_token=True)
158
+ data_dict.update(result)
159
+
160
+ return data_dict
161
+
162
+ if __name__ == '__main__':
163
+ from transformers import CLIPImageProcessor, AutoTokenizer
164
+ from third_parts.segment_anything.utils.transforms import ResizeLongestSide
165
+ pretrained_model = 'MBZUAI/GLaMM-GranD-Pretrained'
166
+ llm_name_or_path = 'lmsys/vicuna-7b-v1.5'
167
+
168
+ tokenizer = dict(
169
+ type=AutoTokenizer.from_pretrained,
170
+ pretrained_model_name_or_path=llm_name_or_path)
171
+ image_processor = dict(
172
+ type=CLIPImageProcessor.from_pretrained,
173
+ pretrained_model_name_or_path='openai/clip-vit-large-patch14-336')
174
+ extra_image_processor = dict(
175
+ type=ResizeLongestSide,
176
+ target_length=1024,
177
+ )
178
+ from xtuner.utils.templates import PROMPT_TEMPLATE
179
+ prompt_template = PROMPT_TEMPLATE.vicuna
180
+ from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory, template_map_fn
181
+ from projects.glamm.datasets.collate_fns.glamm_collate_fn import glamm_collate_fn
182
+
183
+ dataset = ReferSegmDataset(
184
+ tokenizer=tokenizer,
185
+ image_processor=image_processor,
186
+ template_map_fn=dict(
187
+ type=template_map_fn_factory, template=prompt_template),
188
+ extra_image_processor=extra_image_processor,
189
+ data_root='data/coco/',
190
+ data_prefix=dict(img_path='train2014/'),
191
+ ann_file='refcoco+/instances.json',
192
+ split_file='refcoco+/refs(unc).p',
193
+ )
194
+ for i in range(1000):
195
+ dataset[i]
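For reference, the referring-expression turns built in __getitem__ above are just template formatting; a minimal standalone sketch (the phrase is a made-up example, and the templates are imported from the utils module added later in this commit):

import random

from projects.glamm.datasets.utils.utils import SEG_QUESTIONS, ANSWER_LIST

def build_turn(phrase: str) -> dict:
    # one conversation turn per sampled phrase, mirroring ReferSegmDataset.__getitem__;
    # every "[SEG]" in the answer later corresponds to one predicted mask
    question = random.choice(SEG_QUESTIONS).format(class_name=phrase)
    return {'input': question, 'output': random.choice(ANSWER_LIST)}

print(build_turn('the dog on the left'))  # hypothetical phrase, for illustration only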
projects/glamm/datasets/region_level_dataset.py ADDED
@@ -0,0 +1,297 @@
1
+ import copy
2
+ import random
3
+ import glob
4
+ import json
5
+ import logging
6
+ import os
7
+ import torch
8
+
9
+ from mmengine import print_log
10
+ from mmengine.config import Config, ConfigDict
11
+ from PIL import Image
12
+ from torch.utils.data import Dataset
13
+ import numpy as np
14
+ import torch.nn.functional as F
15
+ from pycocotools.coco import COCO
16
+ from pycocotools import mask as mask_utils
17
+
18
+ from xtuner.registry import BUILDER
19
+
20
+ from xtuner.dataset.utils import encode_fn
21
+ from xtuner.dataset.map_fns import llava_map_fn
22
+
23
+ from projects.glamm.datasets.utils.utils import expand2square
24
+
25
+ from projects.glamm.datasets.utils.utils import ANSWER_LIST, REGION_QUESTIONS
26
+ from projects.glamm.utils import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
27
+
28
+
29
+ class RegionDataset(Dataset):
30
+ def __init__(self,
31
+ image_folder,
32
+ image_processor,
33
+ data_path=None,
34
+ tokenizer=None,
35
+ template_map_fn=None,
36
+ max_length=2048,
37
+ pad_image_to_square=False,
38
+ repeats=1,
39
+ num_classes_per_sample=3,
40
+ extra_image_processor=None):
41
+ super().__init__()
42
+
43
+ self.begin_str = f"""{DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n"""
44
+ self.question_templates = REGION_QUESTIONS
45
+
46
+ if extra_image_processor is not None:
47
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
48
+ self.num_classes_per_sample = num_classes_per_sample
49
+ self.tokenizer = BUILDER.build(tokenizer)
50
+
51
+ self.tokenizer.add_tokens(
52
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
53
+ )
54
+ reg_tokens = ['<bbox>', '<point>']
55
+ segmentation_tokens = ['[SEG]']
56
+ phrase_tokens = ['<p>', '</p>']
57
+ special_tokens = reg_tokens + segmentation_tokens + phrase_tokens
58
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
59
+
60
+ self.max_length = max_length
61
+ self.template_map_fn = BUILDER.build(template_map_fn)
62
+
63
+ self.text_data = self._load_annotations(data_path, image_folder)
64
+ self.image_folder = image_folder
65
+
66
+ self.image_processor = BUILDER.build(image_processor)
67
+ size = self.image_processor.crop_size
68
+
69
+ if isinstance(size, dict):
70
+ self.image_w, self.image_h = size['width'], size['height']
71
+ elif isinstance(size, int):
72
+ self.image_h, self.image_w = size, size
73
+ else:
74
+ self.image_w, self.image_h = size
75
+
76
+ self.pad_image_to_square = pad_image_to_square
77
+ self.repeats = repeats
78
+
79
+ def _load_annotations(self, data_path, image_folder=None):
80
+ self.coco = COCO(data_path)
81
+ img_ids = self.coco.getImgIds()
82
+ data_infos = []
83
+ for img_id in img_ids:
84
+ info = self.coco.loadImgs([img_id])[0]
85
+ info['filename'] = info['file_name'].split('_')[-1]
86
+ info['height'] = int(info['height'])
87
+ info['width'] = int(info['width'])
88
+ if min(info['height'], info['width']) < 32:
89
+ continue
90
+ data_infos.append(info)
91
+ return data_infos
92
+
93
+ @property
94
+ def modality_length(self):
95
+ length_list = []
96
+ for data_dict in self.text_data:
97
+ cur_len = 100
98
+ length_list.append(cur_len)
99
+ return length_list * self.repeats
100
+
101
+ def __len__(self):
102
+ return len(self.text_data) * self.repeats
103
+
104
+ def real_len(self):
105
+ return len(self.text_data)
106
+
107
+ def region_processor(self, orig_size, post_size, bboxes, labels):
108
+ orig_h, orig_w = orig_size
109
+ post_h, post_w = post_size
110
+ y_scale = post_h / orig_h
111
+ x_scale = post_w / orig_w
112
+ shuffle_ids = torch.randperm(len(labels))[:self.num_classes_per_sample]
113
+ selected_bboxes = bboxes[shuffle_ids]
114
+
115
+ # Ensure selected_bboxes is two-dimensional
116
+ if len(selected_bboxes.shape) == 1:
117
+ selected_bboxes = np.expand_dims(selected_bboxes, axis=0)
118
+
119
+ selected_labels = [labels[i] for i in shuffle_ids]
120
+ selected_bboxes[:, [0, 2]] *= x_scale
121
+ selected_bboxes[:, [1, 3]] *= y_scale
122
+ selected_bboxes = torch.tensor(
123
+ selected_bboxes, dtype=torch.float32) / post_h
124
+ return selected_bboxes, selected_labels
125
+
126
+ def _parse_annotations(self, img_info):
127
+ data_dict = {}
128
+ bboxes, captions = [], []
129
+ ann_info = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_info['id']))
130
+ image_path = os.path.join(self.image_folder, img_info['file_name'])
131
+ image = Image.open(image_path).convert('RGB')
132
+ if hasattr(self, 'extra_image_processor'):
133
+ g_image = np.array(image) # for grounding
134
+ g_image = self.extra_image_processor.apply_image(g_image)
135
+ g_pixel_values = torch.from_numpy(
136
+ g_image).permute(2, 0, 1).contiguous()
137
+ data_dict['g_pixel_values'] = g_pixel_values
138
+
139
+ orig_w, orig_h = image.size
140
+ if self.pad_image_to_square:
141
+ image = expand2square(
142
+ image, tuple(int(x * 255) for x in self.image_processor.image_mean))
143
+ image = self.image_processor.preprocess(
144
+ image, return_tensors='pt')['pixel_values'][0]
145
+ post_h, post_w = image.shape[1:3]
146
+ data_dict['pixel_values'] = image
147
+
148
+ for ann in ann_info:
149
+ if ann.get('ignore', False) or ann['area'] <= 0 or ann['bbox'][2] < 1 or ann['bbox'][3] < 1:
150
+ continue
151
+ x1, y1, w, h = ann['bbox']
152
+ inter_w = max(0, min(x1 + w, orig_w) - max(x1, 0))
153
+ inter_h = max(0, min(y1 + h, orig_h) - max(y1, 0))
154
+ if inter_w * inter_h == 0:
155
+ continue
156
+ bbox = [x1, y1, x1 + w, y1 + h]
157
+
158
+ if bbox:
159
+ bboxes.append(bbox)
160
+ captions.append(img_info['caption'])
161
+
162
+ if len(bboxes) == 0:
163
+ # no valid boxes: fall back to the annotations of the first sample
+ return self._parse_annotations(copy.deepcopy(self.text_data[0]))
164
+
165
+ bboxes = np.array(bboxes, dtype=np.float32)
166
+ seg_map = img_info['file_name'].replace('jpg', 'png')
167
+ bboxes, captions = self.region_processor((orig_h, orig_w), (post_h, post_w), bboxes, captions)
168
+
169
+ data_dict['bboxes'] = bboxes
170
+ data_dict['captions'] = captions
171
+ data_dict['seg_map'] = seg_map
172
+ return data_dict
173
+
174
+ def create_conversation(self, captions):
175
+ questions = []
176
+ answers = []
177
+ for i, label in enumerate(captions):
178
+ question = random.choice(self.question_templates).strip().replace('<region>', f'region{i + 1} <bbox>')
179
+ questions.append(question)
180
+ answers.append(label)
181
+
182
+ conversation = []
183
+ for i, (question, answer) in enumerate(zip(questions, answers)):
184
+ if i == 0:
185
+ question = self.begin_str + question
186
+ conversation.append({'input': question, 'output': answer})
187
+ return conversation
188
+
189
+ def __getitem__(self, index):
190
+ index = index % self.real_len()
191
+ data_dict = {}
192
+ ann_info = copy.deepcopy(self.text_data[index])
193
+ ann_info = self._parse_annotations(ann_info)
194
+
195
+ data_dict['g_pixel_values'] = ann_info.pop('g_pixel_values', None)
196
+ data_dict['pixel_values'] = ann_info.pop('pixel_values')
197
+ data_dict['bboxes'] = ann_info.pop('bboxes', None)
198
+
199
+ conversation = self.create_conversation(ann_info['captions'])
200
+ data_dict['conversation'] = conversation
201
+
202
+ result = self.template_map_fn(data_dict)
203
+ data_dict.update(result)
204
+
205
+ result = encode_fn(data_dict, tokenizer=self.tokenizer,
206
+ max_length=self.max_length, with_image_token=True)
207
+ data_dict.update(result)
208
+
209
+ return data_dict
210
+
211
+ class RefCocoGRegionDataset(RegionDataset):
212
+ pass
213
+
214
+ class VisualGenomeRegionDataset(RegionDataset):
215
+ def _parse_annotations(self, img_info):
216
+ data_dict = {}
217
+ bboxes, captions = [], []
218
+ ann_info = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_info['id']))
219
+ image_path = os.path.join(self.image_folder, img_info['file_name'])
220
+ image = Image.open(image_path).convert('RGB')
221
+ if hasattr(self, 'extra_image_processor'):
222
+ g_image = np.array(image) # for grounding
223
+ g_image = self.extra_image_processor.apply_image(g_image)
224
+ g_pixel_values = torch.from_numpy(
225
+ g_image).permute(2, 0, 1).contiguous()
226
+ data_dict['g_pixel_values'] = g_pixel_values
227
+
228
+ orig_w, orig_h = image.size
229
+ if self.pad_image_to_square:
230
+ image = expand2square(
231
+ image, tuple(int(x * 255) for x in self.image_processor.image_mean))
232
+ image = self.image_processor.preprocess(
233
+ image, return_tensors='pt')['pixel_values'][0]
234
+ post_h, post_w = image.shape[1:3]
235
+ data_dict['pixel_values'] = image
236
+
237
+ for ann in ann_info:
238
+ if ann.get('ignore', False) or ann['area'] <= 0 or ann['bbox'][2] < 1 or ann['bbox'][3] < 1:
239
+ continue
240
+ x1, y1, w, h = ann['bbox']
241
+ inter_w = max(0, min(x1 + w, orig_w) - max(x1, 0))
242
+ inter_h = max(0, min(y1 + h, orig_h) - max(y1, 0))
243
+ if inter_w * inter_h == 0:
244
+ continue
245
+ bbox = [x1, y1, x1 + w, y1 + h]
246
+
247
+ if bbox:
248
+ bboxes.append(bbox)
249
+ captions.append(ann['caption'].strip())
250
+
251
+ if len(bboxes) == 0:
252
+ # no valid boxes: fall back to the annotations of the first sample
+ return self._parse_annotations(copy.deepcopy(self.text_data[0]))
253
+
254
+ bboxes = np.array(bboxes, dtype=np.float32)
255
+ seg_map = img_info['file_name'].replace('jpg', 'png')
256
+ bboxes, captions = self.region_processor((orig_h, orig_w), (post_h, post_w), bboxes, captions)
257
+
258
+ data_dict['bboxes'] = bboxes
259
+ data_dict['captions'] = captions
260
+ data_dict['seg_map'] = seg_map
261
+ return data_dict
262
+
263
+ if __name__ == '__main__':
264
+ from transformers import CLIPImageProcessor, AutoTokenizer
265
+ from third_parts.segment_anything.utils.transforms import ResizeLongestSide
266
+ pretrained_model = 'MBZUAI/GLaMM-GranD-Pretrained'
267
+ llm_name_or_path = 'lmsys/vicuna-7b-v1.5'
268
+
269
+ tokenizer = dict(
270
+ type=AutoTokenizer.from_pretrained,
271
+ pretrained_model_name_or_path=llm_name_or_path)
272
+ image_processor = dict(
273
+ type=CLIPImageProcessor.from_pretrained,
274
+ pretrained_model_name_or_path='openai/clip-vit-large-patch14-336')
275
+ extra_image_processor = dict(
276
+ type=ResizeLongestSide,
277
+ target_length=1024,
278
+ )
279
+ from xtuner.utils.templates import PROMPT_TEMPLATE
280
+ prompt_template = PROMPT_TEMPLATE.vicuna
281
+ from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory, template_map_fn
282
+ from projects.glamm.datasets.collate_fns.glamm_collate_fn import glamm_collate_fn
283
+ dataset = VisualGenomeRegionDataset(
284
+ image_folder='./data/visual_genome/images',
285
+ image_processor=image_processor,
286
+ data_path='data/visual_genome/train.json',
287
+ tokenizer=tokenizer,
288
+ template_map_fn=dict(
289
+ type=template_map_fn_factory, template=prompt_template),
290
+ max_length=2048,
291
+ pad_image_to_square=False,
292
+ repeats=1,
293
+ num_classes_per_sample=3,
294
+ extra_image_processor=None)
295
+
296
+ for i in range(1000):
297
+ print(dataset[i])
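The box handling in region_processor above reduces to rescaling xyxy boxes from the original image size to the preprocessed size and then normalising; a small numeric sketch of that arithmetic (function name and values are illustrative, not part of the dataset API):

import numpy as np
import torch

def rescale_and_normalize(bboxes, orig_size, post_size):
    # bboxes: (N, 4) [x1, y1, x2, y2] in original-image pixels
    orig_h, orig_w = orig_size
    post_h, post_w = post_size
    boxes = np.asarray(bboxes, dtype=np.float32).copy()
    boxes[:, [0, 2]] *= post_w / orig_w  # scale x coordinates
    boxes[:, [1, 3]] *= post_h / orig_h  # scale y coordinates
    # region_processor divides by post_h, so values end up roughly in [0, 1]
    return torch.from_numpy(boxes) / post_h

print(rescale_and_normalize([[10, 20, 110, 220]], orig_size=(480, 640), post_size=(336, 336)))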
projects/glamm/datasets/semantic_seg_dataset.py ADDED
@@ -0,0 +1,424 @@
1
+ import copy
2
+ import random
3
+ import glob
4
+ import json
5
+ import logging
6
+ import os
7
+ import torch
8
+
9
+ from mmengine import print_log
10
+ from mmengine.config import Config, ConfigDict
11
+ from PIL import Image
12
+ from torch.utils.data import Dataset
13
+ import numpy as np
14
+ import torch.nn.functional as F
15
+ from pycocotools.coco import COCO
16
+
17
+ from xtuner.registry import BUILDER
18
+
19
+ from xtuner.dataset.utils import encode_fn
20
+ from xtuner.dataset.map_fns import llava_map_fn
21
+
22
+ from projects.glamm.datasets.utils.utils import expand2square
23
+
24
+ from projects.glamm.datasets.utils.utils import SEG_QUESTIONS, ANSWER_LIST
25
+ from projects.glamm.utils import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
26
+
27
+
28
+ class SemanticSegDataset(Dataset):
29
+ def __init__(self,
30
+ image_folder,
31
+ image_processor,
32
+ data_path=None,
33
+ tokenizer=None,
34
+ offline_processed_text_folder=None,
35
+ max_dataset_length=None,
36
+ dataset_map_fn=None,
37
+ template_map_fn=None,
38
+ max_length=2048,
39
+ pad_image_to_square=False,
40
+ num_proc=8,
41
+ lazy=False,
42
+ repeats=1,
43
+ gcg_format=False,
44
+ num_classes_per_sample=3,
45
+ extra_image_processor=None):
46
+ super().__init__()
47
+ self.gcg_format = gcg_format
48
+ if extra_image_processor is not None:
49
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
50
+ self.num_classes_per_sample = num_classes_per_sample
51
+ self.tokenizer = BUILDER.build(tokenizer)
52
+
53
+ self.tokenizer.add_tokens(
54
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
55
+ )
56
+ reg_tokens = ['<bbox>', '<point>']
57
+ segmentation_tokens = ['[SEG]']
58
+ phrase_tokens = ['<p>', '</p>']
59
+ special_tokens = reg_tokens + segmentation_tokens + phrase_tokens
60
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
61
+
62
+ assert offline_processed_text_folder or (data_path and tokenizer)
63
+ self.lazy = lazy
64
+
65
+ self.max_length = max_length
66
+ self.dataset_map_fn = dataset_map_fn
67
+ self.template_map_fn = template_map_fn
68
+ if isinstance(self.template_map_fn, dict) and self.lazy:
69
+ _type = self.template_map_fn['type']
70
+ del self.template_map_fn['type']
71
+ self.template_map_fn = _type(**self.template_map_fn)
72
+
73
+ if offline_processed_text_folder and data_path:
74
+ print_log(
75
+ 'Both `offline_processed_text_folder` and '
76
+ '`data_path` are set, and we load dataset from '
77
+ '`offline_processed_text_folder` '
78
+ f'({offline_processed_text_folder})',
79
+ logger='current',
80
+ level=logging.WARNING)
81
+
82
+ if offline_processed_text_folder is not None:
83
+ raise NotImplementedError
84
+ else:
85
+ self.image_label_datas = self.json_file_preprocess(data_path, image_folder)
86
+
87
+ self.image_folder = image_folder
88
+
89
+ if isinstance(image_processor, dict) or isinstance(image_processor, Config) or isinstance(image_processor, ConfigDict):
90
+ self.image_processor = BUILDER.build(image_processor)
91
+ else:
92
+ self.image_processor = image_processor
93
+
94
+ size = self.image_processor.crop_size
95
+
96
+ if isinstance(size, dict):
97
+ self.image_w, self.image_h = size['width'], size['height']
98
+ elif isinstance(size, int):
99
+ self.image_h, self.image_w = size, size
100
+ else:
101
+ self.image_w, self.image_h = size
102
+
103
+ self.pad_image_to_square = pad_image_to_square
104
+ self.down_ratio = 1
105
+ self.repeats = repeats
106
+
107
+ def json_file_preprocess(self, data_path, image_folder):
108
+ # ade20k
109
+ with open(data_path, 'r') as file:
110
+ ade20k_classes = json.load(file)
111
+ ade20k_image_dir = image_folder
112
+ ade20k_images = [os.path.join(ade20k_image_dir, img) for img in os.listdir(ade20k_image_dir) if
113
+ img.endswith('.jpg')]
114
+ ade20k_labels = [img.replace(".jpg", ".png").replace(
115
+ "images", "annotations") for img in ade20k_images]
116
+ self.classes = np.array(ade20k_classes)
117
+
118
+ ret = []
119
+ for image, label in zip(ade20k_images, ade20k_labels):
120
+ ret.append({"image": image, "label": label})
121
+ return ret
122
+
123
+ def __len__(self):
124
+ return len(self.image_label_datas) * self.repeats
125
+
126
+ @property
127
+ def modality_length(self):
128
+ length_list = []
129
+ for data_dict in self.image_label_datas:
130
+ length_list.append(100)
131
+ length_list = length_list * self.repeats
132
+ return length_list
133
+
134
+ def real_len(self):
135
+ return len(self.image_label_datas)
136
+
137
+ def decode_mask(self, label_path):
138
+ label = np.array(Image.open(label_path))
139
+
140
+ # ade20k
141
+ label = np.where(label == 0, 255, label - 1)
142
+ unique_labels = [lbl for lbl in np.unique(label) if lbl != 255]
143
+ if not unique_labels:
144
+ return None, None
145
+
146
+ selected_labels = np.random.choice(unique_labels, min(
147
+ len(unique_labels), self.num_classes_per_sample), replace=False)
148
+ label = torch.from_numpy(label).long()
149
+ masks = torch.stack([label == class_id for class_id in selected_labels], dim=0)
150
+ return masks, selected_labels
151
+
152
+ def __getitem__(self, index):
153
+ index = index % self.real_len()
154
+ data_dict = copy.deepcopy(self.image_label_datas[index])
155
+
156
+ assert 'image' in data_dict.keys()
157
+ if data_dict.get('image', None) is not None:
158
+ image_file = data_dict['image']
159
+ image = Image.open(image_file).convert('RGB')
160
+ if hasattr(self, 'extra_image_processor'):
161
+ g_image = np.array(image) # for grounding
162
+ g_image = self.extra_image_processor.apply_image(g_image)
163
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
164
+ data_dict['g_pixel_values'] = g_pixel_values
165
+
166
+ ori_width, ori_height = image.size
167
+ if self.pad_image_to_square:
168
+ image = expand2square(image, tuple(int(x * 255)
169
+ for x in self.image_processor.image_mean))
170
+ image = self.image_processor.preprocess(
171
+ image, return_tensors='pt')['pixel_values'][0]
172
+ data_dict['pixel_values'] = image
173
+
174
+ # process and get masks
175
+ data_dict['masks'], class_id = self.decode_mask(data_dict['label'])
176
+ if class_id is None:
177
+ return self.__getitem__(0)
178
+
179
+ if self.gcg_format:
180
+ pass
181
+ else:
182
+ conversation = []
183
+ for i, c_id in enumerate(class_id):
184
+ question = random.choice(SEG_QUESTIONS).format(
185
+ class_name=self.classes[c_id].lower())
186
+ if i == 0:
187
+ question = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + question
188
+ conversation.append(
189
+ {'input': question, 'output': random.choice(ANSWER_LIST)})
190
+
191
+ data_dict.update({'conversation': conversation})
192
+ else:
193
+ if hasattr(self.image_processor, 'crop_size'):
194
+ crop_size = self.image_processor.crop_size
195
+ else:
196
+ crop_size = self.image_processor.size
197
+ data_dict['pixel_values'] = torch.zeros(3, crop_size['height'],
198
+ crop_size['width'])
199
+ data_dict['masks'] = None
200
+
201
+ if self.lazy:
202
+ result = self.template_map_fn(data_dict)
203
+ data_dict.update(result)
204
+
205
+ result = encode_fn(data_dict, tokenizer=self.tokenizer,
206
+ max_length=self.max_length, with_image_token=True)
207
+ data_dict.update(result)
208
+
209
+ return data_dict
210
+
211
+ class ADE20kSemanticSegDataset(SemanticSegDataset):
212
+ def __init__(self,
213
+ image_folder,
214
+ image_processor,
215
+ data_path=None,
216
+ tokenizer=None,
217
+ offline_processed_text_folder=None,
218
+ max_dataset_length=None,
219
+ dataset_map_fn=None,
220
+ template_map_fn=None,
221
+ max_length=2048,
222
+ pad_image_to_square=False,
223
+ num_proc=8,
224
+ lazy=False,
225
+ repeats=1,
226
+ gcg_format=False,
227
+ num_classes_per_sample=3,
228
+ extra_image_processor=None):
229
+ super().__init__(
230
+ image_folder=image_folder,
231
+ image_processor=image_processor,
232
+ data_path=data_path,
233
+ tokenizer=tokenizer,
234
+ offline_processed_text_folder=offline_processed_text_folder,
235
+ max_dataset_length=max_dataset_length,
236
+ dataset_map_fn=dataset_map_fn,
237
+ template_map_fn=template_map_fn,
238
+ max_length=max_length,
239
+ pad_image_to_square=pad_image_to_square,
240
+ num_proc=num_proc,
241
+ lazy=lazy,
242
+ repeats=repeats,
243
+ gcg_format=gcg_format,
244
+ num_classes_per_sample=num_classes_per_sample,
245
+ extra_image_processor=extra_image_processor,
246
+ )
247
+
248
+ class COCOStuffSemanticSegDataset(SemanticSegDataset):
249
+ def __init__(self,
250
+ image_folder,
251
+ image_processor,
252
+ data_path=None,
253
+ tokenizer=None,
254
+ offline_processed_text_folder=None,
255
+ max_dataset_length=None,
256
+ dataset_map_fn=None,
257
+ template_map_fn=None,
258
+ max_length=2048,
259
+ pad_image_to_square=False,
260
+ num_proc=8,
261
+ lazy=False,
262
+ repeats=1,
263
+ label_path=None,
264
+ gcg_format=False,
265
+ num_classes_per_sample=3,
266
+ extra_image_processor=None):
267
+ self.label_path = label_path
268
+ super().__init__(
269
+ image_folder=image_folder,
270
+ image_processor=image_processor,
271
+ data_path=data_path,
272
+ tokenizer=tokenizer,
273
+ offline_processed_text_folder=offline_processed_text_folder,
274
+ max_dataset_length=max_dataset_length,
275
+ dataset_map_fn=dataset_map_fn,
276
+ template_map_fn=template_map_fn,
277
+ max_length=max_length,
278
+ pad_image_to_square=pad_image_to_square,
279
+ num_proc=num_proc,
280
+ lazy=lazy,
281
+ repeats=repeats,
282
+ gcg_format=gcg_format,
283
+ num_classes_per_sample=num_classes_per_sample,
284
+ extra_image_processor=extra_image_processor,
285
+ )
286
+ self.cocostuff_class2index = {c: i for i, c in enumerate(self.classes)}
287
+
288
+ def json_file_preprocess(self, data_path, image_folder):
289
+ # coco stuff
290
+ assert self.label_path is not None
291
+ with open(data_path, 'r') as file:
292
+ cocostuff_classes = [line.strip().split(": ")[-1]
293
+ for line in file.readlines()[1:]]
294
+ coco_stuff_image_dir = image_folder
295
+ coco_stuff_label_dir = self.label_path
296
+ coco_stuff_labels = glob.glob(
297
+ os.path.join(coco_stuff_label_dir, "*.png"))
298
+
299
+ coco_stuff_images = [label.replace(".png", ".jpg").replace(coco_stuff_label_dir, coco_stuff_image_dir)
300
+ for label in coco_stuff_labels]
301
+
302
+ self.classes = np.array(cocostuff_classes)
303
+
304
+ ret = []
305
+ for image, label in zip(coco_stuff_images, coco_stuff_labels):
306
+ ret.append({"image": image, "label": label})
307
+ return ret
308
+
309
+ def decode_mask(self, label_path):
310
+ label = np.array(Image.open(label_path))
311
+
312
+ # coco stuff
313
+ ignored_classes = [index for class_name,
314
+ index in self.cocostuff_class2index.items() if "-" in class_name]
315
+ label = np.where(np.isin(label, ignored_classes), 255, label)
316
+
317
+ unique_labels = [lbl for lbl in np.unique(label) if lbl != 255]
318
+ if not unique_labels:
319
+ print("No valid label !!!")
320
+ return None, None
321
+
322
+ # sample up to num_classes_per_sample classes
323
+ selected_labels = np.random.choice(unique_labels, min(
324
+ len(unique_labels), self.num_classes_per_sample), replace=False)
325
+
326
+ label = torch.from_numpy(label).long()
327
+ masks = torch.stack(
328
+ [label == class_id for class_id in selected_labels], dim=0)
329
+ return masks, selected_labels
330
+
331
+ class PascalPartSemanticSegDataset(SemanticSegDataset):
332
+
333
+ def json_file_preprocess(self, data_path, image_folder):
334
+ self.coco_api = COCO(data_path)
335
+ img_ids = self.coco_api.getImgIds()
336
+ all_classes = self.coco_api.loadCats(self.coco_api.getCatIds())
337
+ class_map_pascal_part = {}
338
+ for cat in all_classes:
339
+ cat_main, cat_part = cat["name"].strip().split(":")
340
+ name = (cat_main, cat_part)
341
+ class_map_pascal_part[cat["id"]] = name
342
+ self.classes = class_map_pascal_part
343
+ return img_ids
344
+
345
+ def __getitem__(self, index):
346
+ index = index % self.real_len()
347
+ img_id = self.image_label_datas[index]
348
+ img_info = self.coco_api.loadImgs([img_id])[0]
349
+ file_name = img_info["file_name"]
350
+ data_dict = {}
351
+
352
+ image_file = os.path.join(self.image_folder, file_name)
353
+ image = Image.open(image_file).convert('RGB')
354
+
355
+ if hasattr(self, 'extra_image_processor'):
356
+ g_image = np.array(image) # for grounding
357
+ g_image = self.extra_image_processor.apply_image(g_image)
358
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
359
+ data_dict['g_pixel_values'] = g_pixel_values
360
+
361
+ if self.pad_image_to_square:
362
+ image = expand2square(
363
+ image, tuple(int(x * 255) for x in self.image_processor.image_mean))
364
+ image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
365
+ data_dict['pixel_values'] = image
366
+
367
+ annotation_ids = self.coco_api.getAnnIds(imgIds=img_info["id"])
368
+ annotations = self.coco_api.loadAnns(annotation_ids)
369
+
370
+ if not annotations:
371
+ return self.__getitem__(0)
372
+
373
+ sampled_anns = np.random.choice(annotations, min(
374
+ len(annotations), self.num_classes_per_sample), replace=False)
375
+
376
+ conversation = []
377
+ for i, ann in enumerate(sampled_anns):
378
+ cat_id = ann['category_id']
379
+ sampled_cls = self.classes[cat_id]
380
+ if isinstance(sampled_cls, tuple):
381
+ obj, part = sampled_cls
382
+ name = f"{obj} {part}" if random.random() < 0.5 else f"the {part} of the {obj}"
383
+ else:
384
+ name = sampled_cls
385
+ question = random.choice(SEG_QUESTIONS).format(class_name=name)
386
+ if i == 0:
387
+ question = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + question
388
+ conversation.append(
389
+ {'input': question, 'output': random.choice(ANSWER_LIST)})
390
+
391
+ masks = [self.coco_api.annToMask(ann) for ann in sampled_anns]
392
+ masks = np.stack(masks, axis=0)
393
+ masks = torch.from_numpy(masks)
394
+
395
+ data_dict['masks'] = masks
396
+ data_dict['conversation'] = conversation
397
+
398
+ if self.lazy:
399
+ result = self.template_map_fn(data_dict)
400
+ data_dict.update(result)
401
+
402
+ result = encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length, with_image_token=True)
403
+ data_dict.update(result)
404
+
405
+ return data_dict
406
+
407
+ class PacoSemanticSegDataset(PascalPartSemanticSegDataset):
408
+ def json_file_preprocess(self, data_path, image_folder):
409
+ self.coco_api = COCO(data_path)
410
+ all_classes = self.coco_api.loadCats(self.coco_api.getCatIds())
411
+ class_map_paco = {}
412
+ for cat in all_classes:
413
+ cat_split = cat["name"].strip().split(":")
414
+ if len(cat_split) == 1:
415
+ name = cat_split[0].split("_(")[0]
416
+ else:
417
+ assert len(cat_split) == 2
418
+ obj, part = cat_split
419
+ obj = obj.split("_(")[0]
420
+ part = part.split("_(")[0]
421
+ name = (obj, part)
422
+ class_map_paco[cat["id"]] = name
423
+ self.classes = class_map_paco
424
+ return self.coco_api.getImgIds()
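decode_mask in the ADE20K branch above relies on labels being 1-based with 0 meaning "unlabeled"; a compact sketch of the remapping and per-class mask stacking on a synthetic label map:

import numpy as np
import torch

# synthetic 3x3 ADE20K-style label map: 0 = unlabeled, class ids are 1-based
label = np.array([[0, 1, 1],
                  [5, 5, 0],
                  [1, 5, 5]])

# shift to 0-based ids and mark unlabeled pixels with the ignore value 255
label = np.where(label == 0, 255, label - 1)
unique_labels = [int(lbl) for lbl in np.unique(label) if lbl != 255]

label = torch.from_numpy(label).long()
# one boolean mask per class id, stacked along dim 0
masks = torch.stack([label == class_id for class_id in unique_labels], dim=0)
print(unique_labels, masks.shape)  # [0, 4] torch.Size([2, 3, 3])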
projects/glamm/datasets/utils/ade20k_classes.json ADDED
@@ -0,0 +1,30 @@
1
+ [
2
+ "wall", "building", "sky", "floor", "tree", "ceiling", "road",
3
+ "bed", "windowpane", "grass", "cabinet", "sidewalk",
4
+ "person", "earth", "door", "table", "mountain", "plant",
5
+ "curtain", "chair", "car", "water", "painting", "sofa",
6
+ "shelf", "house", "sea", "mirror", "rug", "field", "armchair",
7
+ "seat", "fence", "desk", "rock", "wardrobe", "lamp",
8
+ "bathtub", "railing", "cushion", "base", "box", "column",
9
+ "signboard", "chest of drawers", "counter", "sand", "sink",
10
+ "skyscraper", "fireplace", "refrigerator", "grandstand",
11
+ "path", "stairs", "runway", "case", "pool table", "pillow",
12
+ "screen door", "stairway", "river", "bridge", "bookcase",
13
+ "blind", "coffee table", "toilet", "flower", "book", "hill",
14
+ "bench", "countertop", "stove", "palm", "kitchen island",
15
+ "computer", "swivel chair", "boat", "bar", "arcade machine",
16
+ "hovel", "bus", "towel", "light", "truck", "tower",
17
+ "chandelier", "awning", "streetlight", "booth",
18
+ "television receiver", "airplane", "dirt track", "apparel",
19
+ "pole", "land", "bannister", "escalator", "ottoman", "bottle",
20
+ "buffet", "poster", "stage", "van", "ship", "fountain",
21
+ "conveyer belt", "canopy", "washer", "plaything",
22
+ "swimming pool", "stool", "barrel", "basket", "waterfall",
23
+ "tent", "bag", "minibike", "cradle", "oven", "ball", "food",
24
+ "step", "tank", "trade name", "microwave", "pot", "animal",
25
+ "bicycle", "lake", "dishwasher", "screen", "blanket",
26
+ "sculpture", "hood", "sconce", "vase", "traffic light",
27
+ "tray", "ashcan", "fan", "pier", "crt screen", "plate",
28
+ "monitor", "bulletin board", "shower", "radiator", "glass",
29
+ "clock", "flag"
30
+ ]
projects/glamm/datasets/utils/cocostuff_classes.txt ADDED
@@ -0,0 +1,183 @@
1
+ 0: unlabeled
2
+ 1: person
3
+ 2: bicycle
4
+ 3: car
5
+ 4: motorcycle
6
+ 5: airplane
7
+ 6: bus
8
+ 7: train
9
+ 8: truck
10
+ 9: boat
11
+ 10: traffic light
12
+ 11: fire hydrant
13
+ 12: street sign
14
+ 13: stop sign
15
+ 14: parking meter
16
+ 15: bench
17
+ 16: bird
18
+ 17: cat
19
+ 18: dog
20
+ 19: horse
21
+ 20: sheep
22
+ 21: cow
23
+ 22: elephant
24
+ 23: bear
25
+ 24: zebra
26
+ 25: giraffe
27
+ 26: hat
28
+ 27: backpack
29
+ 28: umbrella
30
+ 29: shoe
31
+ 30: eye glasses
32
+ 31: handbag
33
+ 32: tie
34
+ 33: suitcase
35
+ 34: frisbee
36
+ 35: skis
37
+ 36: snowboard
38
+ 37: sports ball
39
+ 38: kite
40
+ 39: baseball bat
41
+ 40: baseball glove
42
+ 41: skateboard
43
+ 42: surfboard
44
+ 43: tennis racket
45
+ 44: bottle
46
+ 45: plate
47
+ 46: wine glass
48
+ 47: cup
49
+ 48: fork
50
+ 49: knife
51
+ 50: spoon
52
+ 51: bowl
53
+ 52: banana
54
+ 53: apple
55
+ 54: sandwich
56
+ 55: orange
57
+ 56: broccoli
58
+ 57: carrot
59
+ 58: hot dog
60
+ 59: pizza
61
+ 60: donut
62
+ 61: cake
63
+ 62: chair
64
+ 63: couch
65
+ 64: potted plant
66
+ 65: bed
67
+ 66: mirror
68
+ 67: dining table
69
+ 68: window
70
+ 69: desk
71
+ 70: toilet
72
+ 71: door
73
+ 72: tv
74
+ 73: laptop
75
+ 74: mouse
76
+ 75: remote
77
+ 76: keyboard
78
+ 77: cell phone
79
+ 78: microwave
80
+ 79: oven
81
+ 80: toaster
82
+ 81: sink
83
+ 82: refrigerator
84
+ 83: blender
85
+ 84: book
86
+ 85: clock
87
+ 86: vase
88
+ 87: scissors
89
+ 88: teddy bear
90
+ 89: hair drier
91
+ 90: toothbrush
92
+ 91: hair brush
93
+ 92: banner
94
+ 93: blanket
95
+ 94: branch
96
+ 95: bridge
97
+ 96: building-other
98
+ 97: bush
99
+ 98: cabinet
100
+ 99: cage
101
+ 100: cardboard
102
+ 101: carpet
103
+ 102: ceiling-other
104
+ 103: ceiling-tile
105
+ 104: cloth
106
+ 105: clothes
107
+ 106: clouds
108
+ 107: counter
109
+ 108: cupboard
110
+ 109: curtain
111
+ 110: desk-stuff
112
+ 111: dirt
113
+ 112: door-stuff
114
+ 113: fence
115
+ 114: floor-marble
116
+ 115: floor-other
117
+ 116: floor-stone
118
+ 117: floor-tile
119
+ 118: floor-wood
120
+ 119: flower
121
+ 120: fog
122
+ 121: food-other
123
+ 122: fruit
124
+ 123: furniture-other
125
+ 124: grass
126
+ 125: gravel
127
+ 126: ground-other
128
+ 127: hill
129
+ 128: house
130
+ 129: leaves
131
+ 130: light
132
+ 131: mat
133
+ 132: metal
134
+ 133: mirror-stuff
135
+ 134: moss
136
+ 135: mountain
137
+ 136: mud
138
+ 137: napkin
139
+ 138: net
140
+ 139: paper
141
+ 140: pavement
142
+ 141: pillow
143
+ 142: plant-other
144
+ 143: plastic
145
+ 144: platform
146
+ 145: playingfield
147
+ 146: railing
148
+ 147: railroad
149
+ 148: river
150
+ 149: road
151
+ 150: rock
152
+ 151: roof
153
+ 152: rug
154
+ 153: salad
155
+ 154: sand
156
+ 155: sea
157
+ 156: shelf
158
+ 157: sky
159
+ 158: skyscraper
160
+ 159: snow
161
+ 160: solid-other
162
+ 161: stairs
163
+ 162: stone
164
+ 163: straw
165
+ 164: structural-other
166
+ 165: table
167
+ 166: tent
168
+ 167: textile-other
169
+ 168: towel
170
+ 169: tree
171
+ 170: vegetable
172
+ 171: wall-brick
173
+ 172: wall-concrete
174
+ 173: wall-other
175
+ 174: wall-panel
176
+ 175: wall-stone
177
+ 176: wall-tile
178
+ 177: wall-wood
179
+ 178: water-other
180
+ 179: waterdrops
181
+ 180: window-blind
182
+ 181: window-other
183
+ 182: wood
projects/glamm/datasets/utils/utils.py ADDED
@@ -0,0 +1,131 @@
1
+ from PIL import Image
2
+
3
+
4
+
5
+ def expand2square(pil_img, background_color):
6
+ width, height = pil_img.size
7
+ if width == height:
8
+ return pil_img
9
+ elif width > height:
10
+ result = Image.new(pil_img.mode, (width, width), background_color)
11
+ result.paste(pil_img, (0, (width - height) // 2))
12
+ return result
13
+ else:
14
+ result = Image.new(pil_img.mode, (height, height), background_color)
15
+ result.paste(pil_img, ((height - width) // 2, 0))
16
+ return result
17
+
18
+ CAPTION_QUESTIONS = [
19
+ 'Could you please give me a detailed description of the image?',
20
+ 'Can you provide a thorough description of this image?',
21
+ 'Please provide a thorough description of this image',
22
+ 'Please provide a thorough description of this image.',
23
+ 'Please describe in detail the contents of the image.',
24
+ 'Please describe in detail the contents of the image',
25
+ 'Could you give a comprehensive explanation of what can be found within this picture?',
26
+ 'Could you give me an elaborate explanation of this picture?',
27
+ 'Could you provide me with a detailed analysis of this photo?',
28
+ 'Could you please give me a detailed description of the image?',
29
+ 'Can you provide a thorough description of this image?',
30
+ 'Please describe in detail the contents of the image',
31
+ 'Please describe in detail the contents of the image.',
32
+ 'Can you give a comprehensive explanation of this photo',
33
+ 'Please provide an elaborate explanation of this picture.',
34
+ 'Please provide an elaborate explanation of this picture',
35
+ 'Could you provide me with a detailed analysis of this photo',
36
+ ]
37
+
38
+ REGION_QUESTIONS = [
39
+ 'Can you provide me with a detailed description of the region in the picture marked by <region>?',
40
+ "I'm curious about the region represented by <region> in the picture. Could you describe it in detail?",
41
+ 'What can you tell me about the region indicated by <region> in the image?',
42
+ "I'd like to know more about the area in the photo labeled <region>. Can you give me a detailed description?",
43
+ 'Could you describe the region shown as <region> in the picture in great detail?',
44
+ 'What details can you give me about the region outlined by <region> in the photo?',
45
+ 'Please provide me with a comprehensive description of the region marked with <region> in the image.',
46
+ 'Can you give me a detailed account of the region labeled as <region> in the picture?',
47
+ "I'm interested in learning more about the region represented by <region> in the photo. Can you describe it in detail?",
48
+ 'What is the region outlined by <region> in the picture like? Could you give me a detailed description?',
49
+ 'Can you provide me with a detailed description of the region in the picture marked by <region>, please?',
50
+ "I'm curious about the region represented by <region> in the picture. Could you describe it in detail, please?",
51
+ 'What can you tell me about the region indicated by <region> in the image, exactly?',
52
+ "I'd like to know more about the area in the photo labeled <region>, please. Can you give me a detailed description?",
53
+ 'Could you describe the region shown as <region> in the picture in great detail, please?',
54
+ 'What details can you give me about the region outlined by <region> in the photo, please?',
55
+ 'Please provide me with a comprehensive description of the region marked with <region> in the image, please.',
56
+ 'Can you give me a detailed account of the region labeled as <region> in the picture, please?',
57
+ "I'm interested in learning more about the region represented by <region> in the photo. Can you describe it in detail, please?",
58
+ 'What is the region outlined by <region> in the picture like, please? Could you give me a detailed description?',
59
+ ]
60
+
61
+ REGION_GROUP_QUESTIONS = [
62
+ 'Could you please give me a detailed description of these areas <region>?',
63
+ 'Can you provide a thorough description of the regions <region> in this image?',
64
+ 'Please describe in detail the contents of the boxed areas <region>.',
65
+ 'Could you give a comprehensive explanation of what can be found within <region> in the picture?',
66
+ 'Could you give me an elaborate explanation of the <region> regions in this picture?',
67
+ 'Can you provide a comprehensive description of the areas identified by <region> in this photo?',
68
+ 'Help me understand the specific locations labeled <region> in this picture in detail, please.',
69
+ 'What is the detailed information about the areas marked by <region> in this image?',
70
+ 'Could you provide me with a detailed analysis of the regions designated <region> in this photo?',
71
+ 'What are the specific features of the areas marked <region> in this picture that you can describe in detail?',
72
+ 'Could you elaborate on the regions identified by <region> in this image?',
73
+ 'What can you tell me about the areas labeled <region> in this picture?',
74
+ 'Can you provide a thorough analysis of the specific locations designated <region> in this photo?',
75
+ 'I am interested in learning more about the regions marked <region> in this image. Can you provide me with more information?',
76
+ 'Could you please provide a detailed description of the areas identified by <region> in this photo?',
77
+ 'What is the significance of the regions labeled <region> in this picture?',
78
+ 'I would like to know more about the specific locations designated <region> in this image. Can you provide me with more information?',
79
+ 'Can you provide a detailed breakdown of the regions marked <region> in this photo?',
80
+ 'What specific features can you tell me about the areas identified by <region> in this picture?',
81
+ 'Could you please provide a comprehensive explanation of the locations labeled <region> in this image?',
82
+ 'Can you provide a detailed account of the regions designated <region> in this photo?',
83
+ 'I am curious about the areas marked <region> in this picture. Can you provide me with a detailed analysis?',
84
+ 'What important details can you tell me about the specific locations identified by <region> in this image?',
85
+ 'Could you please provide a detailed description of the regions labeled <region> in this photo?',
86
+ 'What can you tell me about the features of the areas designated <region> in this picture?',
87
+ 'Can you provide a comprehensive overview of the regions marked <region> in this image?',
88
+ 'I would like to know more about the specific locations identified by <region> in this photo. Can you provide me with more information?',
89
+ 'What is the detailed information you have on the areas labeled <region> in this picture?',
90
+ 'Could you provide me with a thorough analysis of the regions designated <region> in this image?',
91
+ 'Can you provide a detailed explanation of the specific locations marked by <region> in this photo?'
92
+ ]
93
+
94
+ GCG_QUESTIONS = [
95
+ 'Could you please give me a detailed description of the image? Please respond with interleaved segmentation masks for the corresponding parts of the answer.',
96
+ 'Can you provide a thorough description of this image? Please output with interleaved segmentation masks for the corresponding phrases.',
97
+ 'Please describe in detail the contents of the image. Please respond with interleaved segmentation masks for the corresponding parts of the answer.',
98
+ 'Could you give a comprehensive explanation of what can be found within this picture? Please output with interleaved segmentation masks for the corresponding phrases.',
99
+ 'Could you give me an elaborate explanation of this picture? Please respond with interleaved segmentation masks for the corresponding phrases.',
100
+ 'Could you provide me with a detailed analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer.',
101
+ ]
102
+
103
+ SEG_QUESTIONS = [
104
+ "Can you segment the {class_name} in this image?",
105
+ "Please segment {class_name} in this image.",
106
+ "What is {class_name} in this image? Please respond with segmentation mask.",
107
+ "What is {class_name} in this image? Please output segmentation mask.",
108
+
109
+ "Can you segment the {class_name} in this image",
110
+ "Please segment {class_name} in this image",
111
+ "What is {class_name} in this image? Please respond with segmentation mask",
112
+ "What is {class_name} in this image? Please output segmentation mask",
113
+
114
+ "Could you provide a segmentation mask for the {class_name} in this image?",
115
+ "Please identify and segment the {class_name} in this image.",
116
+ "Where is the {class_name} in this picture? Please respond with a segmentation mask.",
117
+ "Can you highlight the {class_name} in this image with a segmentation mask?",
118
+
119
+ "Could you provide a segmentation mask for the {class_name} in this image",
120
+ "Please identify and segment the {class_name} in this image",
121
+ "Where is the {class_name} in this picture? Please respond with a segmentation mask",
122
+ "Can you highlight the {class_name} in this image with a segmentation mask",
123
+ ]
124
+
125
+ ANSWER_LIST = [
126
+ "It is [SEG].",
127
+ "Sure, [SEG].",
128
+ "Sure, it is [SEG].",
129
+ "Sure, the segmentation result is [SEG].",
130
+ "[SEG].",
131
+ ]
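expand2square above pads the shorter side with a solid colour so CLIP sees a square image; a quick usage sketch (the mean values are the standard OpenAI CLIP ones and are hard-coded here purely for illustration):

from PIL import Image

from projects.glamm.datasets.utils.utils import expand2square

# the datasets pass tuple(int(x * 255) for x in image_processor.image_mean);
# the usual CLIP means are used here only for the example
clip_mean = (0.48145466, 0.4578275, 0.40821073)
background = tuple(int(x * 255) for x in clip_mean)

img = Image.new('RGB', (640, 480), (255, 0, 0))  # dummy landscape image
padded = expand2square(img, background)
print(padded.size)  # (640, 640): the height is padded up to the width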
projects/glamm/models/glamm.py ADDED
@@ -0,0 +1,183 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from xtuner.registry import BUILDER
5
+ from xtuner.model.utils import LoadWoInit, guess_load_checkpoint
6
+ from xtuner.model.llava import LLaVAModel
7
+
8
+ from mmengine.model import BaseModel
9
+ from mmengine import print_log
10
+
11
+ from projects.glamm.utils import prepare_inputs_labels_for_multimodal
12
+ from projects.glamm.utils import DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
13
+
14
+
15
+ class GLaMM(LLaVAModel):
16
+ def __init__(self,
17
+ use_activation_checkpointing=True,
18
+ tokenizer=None,
19
+ grounding_encoder=None,
20
+ region_encoder=None,
21
+ loss_mask=None,
22
+ loss_dice=None,
23
+ *args, **kwargs):
24
+ super(GLaMM, self).__init__(
25
+ *args, use_activation_checkpointing=use_activation_checkpointing, **kwargs)
26
+
27
+ self.use_activation_checkpointing = use_activation_checkpointing
28
+ self.tokenizer = BUILDER.build(tokenizer)
29
+ self._add_special_tokens()
30
+
31
+ self.grounding_encoder = BUILDER.build(grounding_encoder)
32
+ self.grounding_encoder.requires_grad_(False)
33
+ self.grounding_encoder.mask_decoder.requires_grad_(True)
34
+
35
+ if region_encoder is not None:
36
+ self.region_encoder = BUILDER.build(region_encoder)
37
+
38
+ in_dim = self.config.hidden_size
39
+ out_dim = self.grounding_encoder.mask_decoder.transformer_dim
40
+ self.text_hidden_fcs = nn.Sequential(
41
+ nn.Linear(in_dim, in_dim), nn.ReLU(inplace=True),
42
+ nn.Linear(in_dim, out_dim), nn.Dropout(0.0)
43
+ )
44
+
45
+ self.loss_mask = BUILDER.build(loss_mask)
46
+ self.loss_dice = BUILDER.build(loss_dice)
47
+
48
+ def _add_special_tokens(self):
49
+ reg_tokens = ['<im_start>', '<im_end>', '<bbox>', '<point>']
50
+ segmentation_tokens = ['[SEG]']
51
+ phrase_tokens = ['<p>', '</p>']
52
+ special_tokens = reg_tokens + segmentation_tokens + phrase_tokens
53
+ num_new_tokens = self.tokenizer.add_tokens(
54
+ special_tokens, special_tokens=True)
55
+ if num_new_tokens > 0:
56
+ self.llm.resize_token_embeddings(len(self.tokenizer))
57
+ input_embeddings = self.llm.get_input_embeddings().weight.data
58
+ output_embeddings = self.llm.get_output_embeddings().weight.data
59
+
60
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
61
+ dim=0, keepdim=True)
62
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
63
+ dim=0, keepdim=True)
64
+
65
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
66
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
67
+
68
+ self.seg_token_idx = self.tokenizer("[SEG]", add_special_tokens=False).input_ids[0]
69
+ self.bop_token_idx = self.tokenizer("<p>", add_special_tokens=False).input_ids[0]
70
+ self.eop_token_idx = self.tokenizer("</p>", add_special_tokens=False).input_ids[0]
71
+ self.bbox_token_idx = self.tokenizer("<bbox>", add_special_tokens=False).input_ids[0]
72
+
73
+ if self.use_activation_checkpointing or self.use_llm_lora or not self.freeze_llm:
74
+ self.llm.enable_input_require_grads()
75
+
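The new-token initialisation in _add_special_tokens follows the common recipe of setting every freshly added row of the input/output embedding tables to the mean of the pre-existing rows; a toy standalone sketch of that idea:

import torch

num_new_tokens = 3
embeddings = torch.randn(10, 4)  # toy (vocab_size, hidden_size) embedding table
# new rows start at the mean of the original vocabulary, which keeps their
# statistics close to the trained embeddings instead of random noise
embeddings[-num_new_tokens:] = embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
print(embeddings[-num_new_tokens:])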
76
+ def forward(self, data, data_samples=None, mode='loss'):
77
+ if 'pixel_values' in data:
78
+ visual_outputs = self.visual_encoder(
79
+ data['pixel_values'].to(self.visual_encoder.dtype),
80
+ output_hidden_states=True)
81
+ pixel_values = self.projector(
82
+ visual_outputs.hidden_states[self.visual_select_layer][:, 1:])
83
+ data['pixel_values'] = pixel_values
84
+ bboxes = data.pop('bboxes', None)
85
+ if bboxes is not None:
86
+ select_hidden_state_layer = -2
87
+ num_level_reg_features = 4
88
+ mlvl_reg_features = visual_outputs.hidden_states[select_hidden_state_layer::-3]
89
+ mlvl_reg_features = mlvl_reg_features[::-1]
90
+ mlvl_reg_features = mlvl_reg_features[-num_level_reg_features:]
91
+ mlvl_reg_features = [item[:, 1:] for item in mlvl_reg_features]
92
+ mlvl_reg_features = self.region_encoder(mlvl_reg_features, bboxes)
93
+ data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
94
+
95
+ if bboxes is not None:
96
+ inputs_embeds = data['inputs_embeds']
97
+ for i, reg_feat in enumerate(mlvl_reg_features):
98
+ reg_mask = data['new_input_ids'][i] == self.bbox_token_idx
99
+ inputs_embeds[i][reg_mask] = reg_feat
100
+ data['inputs_embeds'] = inputs_embeds
101
+
102
+ if mode == 'loss':
103
+ return self.compute_loss(data, data_samples)
104
+ elif mode == 'predict':
105
+ return self.predict(data, data_samples)
106
+ elif mode == 'tensor':
107
+ return self._forward(data, data_samples)
108
+ else:
109
+ raise NotImplementedError
110
+
111
+ def compute_loss(self, data, data_samples=None):
112
+ g_pixel_values = data.pop('g_pixel_values', None)
113
+ gt_masks = data.pop('masks', None)
114
+ new_input_ids = data.pop('new_input_ids', None)
115
+
116
+ output = self.llm(output_hidden_states=True, **data)
117
+ if gt_masks is None:
118
+ return {'llm_loss': output.loss}
119
+
120
+ resize_list = [pixel.shape[-2:] for pixel in g_pixel_values]
121
+ ori_size_list = [mask.shape[-2:] for mask in gt_masks]
122
+ g_pixel_values = torch.stack([
123
+ self.grounding_encoder.preprocess(pixel) for pixel in g_pixel_values
124
+ ])
125
+ image_embeddings = self.grounding_encoder.image_encoder(g_pixel_values)
126
+
127
+ seg_token_mask = new_input_ids == self.seg_token_idx
128
+ hidden_states = output.hidden_states
129
+ hidden_states = self.text_hidden_fcs(hidden_states[-1])
130
+ pred_embeddings = hidden_states[seg_token_mask]
131
+
132
+ seg_token_counts = seg_token_mask.int().sum(-1)
133
+ pred_embeddings_list = torch.split(pred_embeddings, seg_token_counts.tolist(), dim=0)
134
+
135
+ pred_masks = self._generate_and_postprocess_masks(
136
+ pred_embeddings_list, image_embeddings, resize_list, ori_size_list)
137
+
138
+ bs = len(pred_masks)
139
+ loss_mask, loss_dice = 0, 0
140
+ for i in range(bs):
141
+ pred_mask = pred_masks[i]
142
+ gt_mask = gt_masks[i]
143
+
144
+ sam_loss_mask = self.loss_mask(pred_mask, gt_mask)
145
+ sam_loss_dice = self.loss_dice(pred_mask, gt_mask)
146
+ accuracy = torch.eq((pred_mask.sigmoid() > 0.5), gt_mask).to(pred_mask).mean()
147
+ loss_mask += sam_loss_mask
148
+ loss_dice += sam_loss_dice
149
+
150
+
151
+ loss_dict = {
152
+ 'loss_mask': loss_mask / bs,
153
+ 'loss_dice': loss_dice / bs,
154
+ 'accuracy': accuracy,
155
+ 'llm_loss': output.loss,
156
+ }
157
+ return loss_dict
158
+
159
+
160
+ def _generate_and_postprocess_masks(self, pred_embeddings, image_embeddings, resize_list=None, orig_size_list=None, infer=False):
161
+ pred_masks = []
162
+ for i, pred_embedding in enumerate(pred_embeddings):
163
+ sparse_embeddings, dense_embeddings = self.grounding_encoder.prompt_encoder(
164
+ points=None, boxes=None, masks=None, text_embeds=pred_embedding.unsqueeze(1)
165
+ )
166
+ sparse_embeddings = sparse_embeddings.to(pred_embedding.dtype)
167
+ low_res_masks, _ = self.grounding_encoder.mask_decoder(
168
+ image_embeddings=image_embeddings[i].unsqueeze(0),
169
+ image_pe=self.grounding_encoder.prompt_encoder.get_dense_pe(),
170
+ sparse_prompt_embeddings=sparse_embeddings, dense_prompt_embeddings=dense_embeddings,
171
+ multimask_output=False, )
172
+
173
+ pred_mask = self.grounding_encoder.postprocess_masks(
174
+ low_res_masks, input_size=resize_list[i], original_size=orig_size_list[i], )
175
+ pred_masks.append(pred_mask[:, 0])
176
+ return pred_masks
177
+
178
+ def predict(self, data, data_samples=None):
179
+ pass
180
+
181
+ def _forward(self, data, data_samples=None):
182
+ outputs = self.llm(**data)
183
+ return outputs
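The central indexing step in compute_loss above is worth isolating: every position whose id equals the [SEG] token yields one mask embedding, and torch.split regroups them per sample. A shape-only sketch (token id 7 is a stand-in for the real [SEG] id):

import torch

hidden_dim, seg_token_idx = 8, 7  # 7 stands in for the real [SEG] token id here
new_input_ids = torch.tensor([[1, 7, 2, 7],   # sample 0 has two [SEG] tokens
                              [1, 2, 7, 3]])  # sample 1 has one
hidden_states = torch.randn(2, 4, hidden_dim)  # projected LLM last hidden states

seg_token_mask = new_input_ids == seg_token_idx
pred_embeddings = hidden_states[seg_token_mask]      # (3, 8), flattened over the batch
seg_token_counts = seg_token_mask.int().sum(-1)      # tensor([2, 1])
per_sample = torch.split(pred_embeddings, seg_token_counts.tolist(), dim=0)
print([e.shape for e in per_sample])  # [torch.Size([2, 8]), torch.Size([1, 8])]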
projects/glamm/models/region_encoder.py ADDED
@@ -0,0 +1,359 @@
1
+ from abc import ABCMeta, abstractmethod
2
+ from typing import List, Optional, Tuple
3
+ from torch import Tensor
4
+
5
+ import math
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from mmcv import ops
11
+ from mmcv.cnn import ConvModule, Linear
12
+ from mmengine.model import BaseModule
13
+
14
+ class BaseRoIExtractor(BaseModule, metaclass=ABCMeta):
15
+ """Base class for RoI extractor.
16
+
17
+ Args:
18
+ roi_layer (:obj:`ConfigDict` or dict): Specify RoI layer type and
19
+ arguments.
20
+ out_channels (int): Output channels of RoI layers.
21
+ featmap_strides (list[int]): Strides of input feature maps.
22
+ init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
23
+ dict], optional): Initialization config dict. Defaults to None.
24
+ """
25
+
26
+ def __init__(self,
27
+ roi_layer,
28
+ out_channels: int,
29
+ featmap_strides: List[int],
30
+ init_cfg=None) -> None:
31
+ super().__init__(init_cfg=init_cfg)
32
+ self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides)
33
+ self.out_channels = out_channels
34
+ self.featmap_strides = featmap_strides
35
+
36
+ @property
37
+ def num_inputs(self) -> int:
38
+ """int: Number of input feature maps."""
39
+ return len(self.featmap_strides)
40
+
41
+ def build_roi_layers(self, layer_cfg,
42
+ featmap_strides: List[int]) -> nn.ModuleList:
43
+ """Build RoI operator to extract feature from each level feature map.
44
+
45
+ Args:
46
+ layer_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
47
+ config RoI layer operation. Options are modules under
48
+ ``mmcv/ops`` such as ``RoIAlign``.
49
+ featmap_strides (list[int]): The stride of each input feature map with respect
50
+ to the original image size, which would be used to scale RoI
51
+ coordinate (original image coordinate system) to feature
52
+ coordinate system.
53
+
54
+ Returns:
55
+ :obj:`nn.ModuleList`: The RoI extractor modules for each level
56
+ feature map.
57
+ """
58
+
59
+ cfg = layer_cfg.copy()
60
+ layer_type = cfg.pop('type')
61
+ if isinstance(layer_type, str):
62
+ assert hasattr(ops, layer_type)
63
+ layer_cls = getattr(ops, layer_type)
64
+ else:
65
+ layer_cls = layer_type
66
+ roi_layers = nn.ModuleList(
67
+ [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides])
68
+ return roi_layers
69
+
70
+ def roi_rescale(self, rois: Tensor, scale_factor: float) -> Tensor:
71
+ """Scale RoI coordinates by scale factor.
72
+
73
+ Args:
74
+ rois (Tensor): RoI (Region of Interest), shape (n, 5)
75
+ scale_factor (float): Scale factor that RoI will be multiplied by.
76
+
77
+ Returns:
78
+ Tensor: Scaled RoI.
79
+ """
80
+
81
+ cx = (rois[:, 1] + rois[:, 3]) * 0.5
82
+ cy = (rois[:, 2] + rois[:, 4]) * 0.5
83
+ w = rois[:, 3] - rois[:, 1]
84
+ h = rois[:, 4] - rois[:, 2]
85
+ new_w = w * scale_factor
86
+ new_h = h * scale_factor
87
+ x1 = cx - new_w * 0.5
88
+ x2 = cx + new_w * 0.5
89
+ y1 = cy - new_h * 0.5
90
+ y2 = cy + new_h * 0.5
91
+ new_rois = torch.stack((rois[:, 0], x1, y1, x2, y2), dim=-1)
92
+ return new_rois
93
+
94
+ @abstractmethod
95
+ def forward(self,
96
+ feats: Tuple[Tensor],
97
+ rois: Tensor,
98
+ roi_scale_factor: Optional[float] = None) -> Tensor:
99
+ """Extract RoI features.
100
+
101
+ Args:
102
+ feats (Tuple[Tensor]): Multi-scale features.
103
+ rois (Tensor): RoIs with the shape (n, 5) where the first
104
+ column indicates batch id of each RoI.
105
+ roi_scale_factor (Optional[float]): RoI scale factor.
106
+ Defaults to None.
107
+
108
+ Returns:
109
+ Tensor: RoI feature.
110
+ """
111
+ pass
112
+
113
+
114
+ class MLVLFuseModule(nn.Module):
115
+ def __init__(self, input_dims=1024, embed_dims=1024, num_levels=3, num_fuse=4):
116
+ super(MLVLFuseModule, self).__init__()
117
+ self.embed_dims = embed_dims
118
+ self.num_levels = num_levels
119
+ self.num_fuse = num_fuse
120
+ self.input_dims = input_dims
121
+ self.shuffle_channles = embed_dims // 4
122
+
123
+ # contains the tuple of level indices that will do the interaction
124
+ self.fuse_lvl_list = []
125
+ num_levels = self.num_levels
126
+ for lvl in range(num_levels):
127
+ top_lvl = min(lvl + 1, num_levels - 1)
128
+ dow_lvl = max(lvl - 1, 0)
129
+ tar_lvl = lvl
130
+ self.fuse_lvl_list.append((tar_lvl, top_lvl, dow_lvl))
131
+
132
+ self.remain_chs = self.embed_dims - self.shuffle_channles * 2
133
+ self._init_layers()
134
+
135
+ def generate_coordinate(self, featmap_sizes, device='cuda'):
136
+
137
+ x_range = torch.linspace(-1, 1, featmap_sizes[-1], device=device)
138
+ y_range = torch.linspace(-1, 1, featmap_sizes[-2], device=device)
139
+ y, x = torch.meshgrid(y_range, x_range)
140
+ y = y.expand([featmap_sizes[0], 1, -1, -1])
141
+ x = x.expand([featmap_sizes[0], 1, -1, -1])
142
+ coord_feat = torch.cat([x, y], 1)
143
+
144
+ return coord_feat
145
+
146
+ def _init_layers(self):
147
+ self.input_conv = nn.ModuleList([nn.Conv2d(self.input_dims + 2,
148
+ self.embed_dims, 1)
149
+ for _ in range(self.num_levels)])
150
+ self.fuse_convs = nn.ModuleList()
151
+ for i in range(self.num_fuse):
152
+ self.fuse_convs.append(
153
+ ConvModule(self.embed_dims,
154
+ self.embed_dims,
155
+ 3,
156
+ stride=1,
157
+ padding=3 // 2,
158
+ conv_cfg=None,
159
+ norm_cfg=dict(type='GN',
160
+ num_groups=64,
161
+ requires_grad=True)
162
+ ))
163
+
164
+ def init_weights(self):
165
+ pass
166
+
167
+ def _single_shuffle(self, inputs, conv_module):
168
+ if not isinstance(conv_module, (nn.ModuleList, list)):
169
+ conv_module = [conv_module]
170
+ for single_conv_m in conv_module:
171
+ fused_inputs = []
172
+ for fuse_lvl_tuple in self.fuse_lvl_list:
173
+ tar_lvl, top_lvl, dow_lvl = fuse_lvl_tuple
174
+ tar_input = inputs[tar_lvl]
175
+ top_input = inputs[top_lvl]
176
+ down_input = inputs[dow_lvl]
177
+ remain = tar_input[:, :self.remain_chs]
178
+ from_top = top_input[:, self.remain_chs:][:, self.shuffle_channles:]
179
+ from_top = F.interpolate(from_top.to(torch.float32),
180
+ size=tar_input.shape[-2:],
181
+ mode='bilinear',
182
+ align_corners=True)
183
+ from_down = down_input[:, self.remain_chs:][:, :self.shuffle_channles]
184
+ from_down = F.interpolate(from_down.to(torch.float32),
185
+ size=tar_input.shape[-2:],
186
+ mode='bilinear',
187
+ align_corners=True)
188
+ fused_inputs.append(
189
+ torch.cat([remain, from_top.to(remain.dtype), from_down.to(remain.dtype)], dim=1))
190
+ fused_inputs = [single_conv_m(item) for item in fused_inputs]
191
+ inputs = fused_inputs
192
+ return inputs
193
+
194
+ def forward(self, inputs, ):
195
+ feat_size = [item.shape for item in inputs]
196
+ new_inputs = []
197
+ for feat, single_feat_size in zip(inputs, feat_size):
198
+ coord_feat = self.generate_coordinate(
199
+ single_feat_size, device=inputs[0].device)
200
+ # feat = torch.cat([feat, coord_feat], dim=1)
201
+ feat = torch.cat([feat, coord_feat.to(feat.dtype)], dim=1)
202
+ new_inputs.append(feat)
203
+ inputs = new_inputs
204
+
205
+ inputs = [self.input_conv[lvl](item)
206
+ for lvl, item in enumerate(inputs)]
207
+
208
+ for conv_m in self.fuse_convs:
209
+ inputs = self._single_shuffle(inputs, [conv_m])
210
+ return inputs
211
+
212
+
213
+ class MlvlRoIExtractor(BaseRoIExtractor):
214
+ def __init__(self,
215
+ roi_layer,
216
+ out_channels,
217
+ featmap_strides,
218
+ embed_dims=1024,
219
+ stride=1,
220
+ norm_init=True,
221
+ fuse_level=3,
222
+ finest_scale=56,
223
+ init_cfg=None):
224
+ super(MlvlRoIExtractor, self).__init__(roi_layer, out_channels,
225
+ featmap_strides, init_cfg)
226
+ self.embed_dims = embed_dims
227
+ self.finest_scale = finest_scale
228
+ self.fuse_level = fuse_level
229
+ self.norm_init = norm_init
230
+
231
+ self.pconvs = nn.ModuleList(
232
+ nn.Conv2d(self.embed_dims, self.embed_dims, 3, stride=1, padding=1)
233
+ for _ in range(self.fuse_level))
234
+ self.pos_embedd = nn.Sequential(
235
+ nn.Linear(4, 256),
236
+ nn.ReLU(inplace=True),
237
+ nn.LayerNorm(256),
238
+ nn.Linear(256, 1024),
239
+ nn.ReLU(inplace=True),
240
+ nn.LayerNorm(1024),
241
+ )
242
+ self.updims = nn.Linear(1024, 4096)
243
+
244
+ self.flatten_linear = nn.Linear(
245
+ self.embed_dims * self.roi_layers[0].output_size[0] ** 2, 1024)
246
+
247
+ self.norm_init_weights()
248
+
249
+ # self.dtype = torch.float32
250
+ def norm_init_weights(self):
251
+ pass
252
+
253
+ def forward(self, feats, rois, roi_scale_factor=None):
254
+ """Forward function."""
255
+ num_imgs = len(rois)
256
+ # feats = [item for item in feats]
257
+ batch_rois = torch.cat(rois, dim=0).to(feats[0].dtype)
258
+ pos_embedd = self.pos_embedd(batch_rois)
259
+ out_size = self.roi_layers[0].output_size
260
+ num_levels = len(feats)
261
+ if feats[0].dim() == 3:
262
+ h = w = int(math.sqrt(feats[0].shape[1]))
263
+ assert h == 16
264
+ assert w == 16
265
+ b, c = feats[0].shape[0], feats[0].shape[-1]
266
+ feats = [item.reshape(b, h, w, c).permute(0, 3, 1, 2)
267
+ for item in feats]
268
+ new_rois = []
269
+ for img_id, single_img_roi in enumerate(rois):
270
+ # rescale to original img scale
271
+ single_img_roi = single_img_roi * 224
272
+
273
+ roi_img_id = single_img_roi.new_ones(len(single_img_roi)) * img_id
274
+ single_img_roi = torch.cat(
275
+ [roi_img_id[:, None], single_img_roi], dim=1)
276
+ new_rois.append(single_img_roi)
277
+ rois = torch.cat(new_rois)
278
+
279
+ roi_feats = feats[0].new_zeros(self.fuse_level,
280
+ rois.size(0), self.out_channels, *out_size)
281
+
282
+ for i in range(num_levels):
283
+ if len(rois) > 0:
284
+ rois_ = rois
285
+ ori_dtype = feats[i].dtype
286
+ roi_feats_t = self.roi_layers[i](feats[i].to(
287
+ torch.float32), rois_.to(torch.float32))
288
+
289
+ roi_feats[i] = roi_feats_t.to(ori_dtype)
290
+
291
+ else:
292
+ roi_feats += sum(
293
+ x.view(-1)[0]
294
+ for x in self.parameters()) * 0. + feats[i].sum() * 0.
295
+
296
+ fuse_roi_feats = []
297
+ for i in range(self.fuse_level):
298
+ fuse_roi_feats.append(self.pconvs[i](roi_feats[i]))
299
+
300
+ fuse_roi_feats = sum(fuse_roi_feats)
301
+ fuse_roi_feats = F.relu(fuse_roi_feats)
302
+ fuse_roi_feats = fuse_roi_feats.flatten(1, -1)
303
+ fuse_roi_feats = self.flatten_linear(fuse_roi_feats)
304
+ fuse_roi_feats = fuse_roi_feats + pos_embedd
305
+ fuse_roi_feats = self.updims(fuse_roi_feats)
306
+ query_feats = []
307
+ for i in range(num_imgs):
308
+ mask = rois[:, 0] == i
309
+ query_feats.append(fuse_roi_feats[mask])
310
+
311
+ return query_feats
312
+
313
+
314
+ class MLVLROIQueryModule(nn.Module):
315
+ def __init__(self, embed_dims=1024, out_dims=4096,
316
+ num_levels=3):
317
+ super(MLVLROIQueryModule, self).__init__()
318
+ self.mlvl_fuse = MLVLFuseModule(input_dims=embed_dims,
319
+ embed_dims=embed_dims,
320
+ num_levels=num_levels,
321
+ num_fuse=5)
322
+ strids = [14 / 8, 14 / 4, 14 / 2, 14]
323
+ assert len(strids) == num_levels
324
+ bbox_roi_extractor = dict(roi_layer=dict(type='RoIAlign',
325
+ output_size=14,
326
+ sampling_ratio=2),
327
+ out_channels=embed_dims,
328
+ embed_dims=embed_dims,
329
+ fuse_level=num_levels,
330
+ featmap_strides=strids)
331
+
332
+ self.roi_align = MlvlRoIExtractor(**bbox_roi_extractor)
333
+
334
+ def forward(self, mlvl_feats, bboxes):
335
+ if mlvl_feats[0].dim() == 3:
336
+ h = w = int(math.sqrt(mlvl_feats[0].shape[1]))
337
+ assert h == 24
338
+ assert w == 24
339
+ b, c = mlvl_feats[0].shape[0], mlvl_feats[0].shape[-1]
340
+ mlvl_feats = [item.reshape(b, h, w, c).permute(0, 3, 1, 2) for item in mlvl_feats]
341
+ base_shape = mlvl_feats[0].shape[-2:]
342
+ num_level = len(mlvl_feats)
343
+ to_shape = [(base_shape[0] * 2 ** level, base_shape[1] * 2 ** level)
344
+ for level in range(num_level)]
345
+ to_shape = to_shape[::-1]
346
+ for level in range(num_level):
347
+ feat = mlvl_feats[level]
348
+ shape = to_shape[level]
349
+ # feat = feat
350
+ # mlvl_feats[level] = F.interpolate(feat, size=shape, mode='bilinear', align_corners=True)
351
+ # todo: temporary fix for "upsample_bilinear2d_out_frame" not implemented for 'BFloat16'
352
+ feat = feat.to(torch.float32)
353
+ mlvl_feats[level] = F.interpolate(
354
+ feat, size=shape, mode='bilinear', align_corners=True)
355
+ mlvl_feats[level] = mlvl_feats[level].to(torch.bfloat16)
356
+
357
+ mlvl_feats = self.mlvl_fuse(mlvl_feats)
358
+
359
+ return self.roi_align(mlvl_feats, bboxes)
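For orientation, here is a minimal usage sketch of the region encoder defined above. It is illustrative only: the batch size, box values, CUDA placement and bfloat16 cast are assumptions, while the shapes follow the code's own asserts (four feature levels to match the four hard-coded strides, a 24x24 grid of 1024-d tokens per level, and boxes in normalized xyxy coordinates, which MlvlRoIExtractor rescales by 224).

import torch

# hypothetical setup; MLVLROIQueryModule.forward hard-casts features to bfloat16,
# so the module itself is moved to bfloat16 as well
encoder = MLVLROIQueryModule(embed_dims=1024, out_dims=4096, num_levels=4)
encoder = encoder.cuda().to(torch.bfloat16)

# four levels of ViT features, each (batch, 24*24 tokens, 1024 channels)
mlvl_feats = [torch.randn(2, 576, 1024, device='cuda', dtype=torch.bfloat16)
              for _ in range(4)]
# one box tensor per image, normalized xyxy in [0, 1]
bboxes = [torch.tensor([[0.1, 0.2, 0.6, 0.8]], device='cuda', dtype=torch.bfloat16),
          torch.tensor([[0.0, 0.0, 1.0, 1.0],
                        [0.3, 0.3, 0.9, 0.9]], device='cuda', dtype=torch.bfloat16)]

region_queries = encoder(mlvl_feats, bboxes)
# -> a list with one tensor per image, here (1, 4096) and (2, 4096):
#    a single 4096-d embedding per box, sized for the language model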
projects/glamm/utils.py ADDED
@@ -0,0 +1,280 @@
1
+ from enum import Enum
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.distributed as dist
6
+
7
+ from transformers import PreTrainedModel
8
+ from typing import List, Optional
9
+
10
+
11
+ IGNORE_INDEX = -100
12
+ IMAGE_TOKEN_INDEX = -200
13
+
14
+ DEFAULT_EOS_TOKEN = '</s>'
15
+ DEFAULT_BOS_TOKEN = '<s>'
16
+ DEFAULT_UNK_TOKEN = '<unk>'
17
+
18
+ DEFAULT_IMAGE_TOKEN = "<image>"
19
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
20
+ DEFAULT_IM_START_TOKEN = "<im_start>"
21
+ DEFAULT_IM_END_TOKEN = "<im_end>"
22
+ DEFAULT_BBOX_TOKEN = "<bbox>"
23
+
24
+
25
+
26
+ # Modified from https://github.com/haotian-liu/LLaVA/blob/82fc5e0e5f4393a4c26851fa32c69ab37ea3b146/llava/model/llava_arch.py#L99 # noqa: E501
27
+ def prepare_inputs_labels_for_multimodal(
28
+ llm: PreTrainedModel,
29
+ input_ids: torch.LongTensor = None,
30
+ position_ids: Optional[torch.LongTensor] = None,
31
+ attention_mask: Optional[torch.Tensor] = None,
32
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
33
+ labels: Optional[torch.LongTensor] = None,
34
+ pixel_values: Optional[torch.FloatTensor] = None,
35
+ **kwargs):
36
+ if pixel_values is None:
37
+ kwargs.update({
38
+ 'input_ids': input_ids,
39
+ 'position_ids': position_ids,
40
+ 'attention_mask': attention_mask,
41
+ 'past_key_values': past_key_values,
42
+ 'inputs_embeds': None,
43
+ 'labels': labels
44
+ })
45
+ return kwargs
46
+
47
+ _labels = labels
48
+ _position_ids = position_ids
49
+ _attention_mask = attention_mask
50
+ if attention_mask is None:
51
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
52
+ else:
53
+ attention_mask = attention_mask.bool()
54
+ if position_ids is None:
55
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
56
+ if labels is None:
57
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
58
+
59
+ # remove the padding using attention_mask -- TODO: double check
60
+ input_ids = [
61
+ cur_input_ids[cur_attention_mask]
62
+ for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
63
+ ]
64
+ labels = [
65
+ cur_labels[cur_attention_mask]
66
+ for cur_labels, cur_attention_mask in zip(labels, attention_mask)
67
+ ]
68
+
69
+ new_inputs_embeds = []
70
+ new_labels = []
71
+ new_input_ids = []
72
+ cur_image_idx = 0
73
+ for batch_idx, cur_input_ids in enumerate(input_ids):
74
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
75
+ if num_images == 0:
76
+ cur_pixel_values = pixel_values[cur_image_idx]
77
+ cur_inputs_embeds_1 = llm.get_input_embeddings()(cur_input_ids)
78
+ cur_inputs_embeds = torch.cat([cur_inputs_embeds_1, cur_pixel_values[0:0]], dim=0)
79
+ new_inputs_embeds.append(cur_inputs_embeds)
80
+ new_labels.append(labels[batch_idx])
81
+ new_input_ids.append(cur_input_ids)
82
+ cur_image_idx += 1
83
+ continue
84
+
85
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
86
+ cur_input_ids_noim = []
87
+ cur_labels = labels[batch_idx]
88
+ cur_labels_noim = []
89
+ for i in range(len(image_token_indices) - 1):
90
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1:image_token_indices[i + 1]])
91
+ cur_labels_noim.append(cur_labels[image_token_indices[i] + 1:image_token_indices[i + 1]])
92
+
93
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
94
+ cur_inputs_embeds = llm.get_input_embeddings()(torch.cat(cur_input_ids_noim))
95
+ cur_inputs_embeds_no_im = torch.split(cur_inputs_embeds, split_sizes, dim=0)
96
+ cur_new_inputs_embeds = []
97
+ cur_new_labels = []
98
+ cur_new_input_ids = []
99
+
100
+ for i in range(num_images + 1):
101
+ cur_new_inputs_embeds.append(cur_inputs_embeds_no_im[i])
102
+ cur_new_labels.append(cur_labels_noim[i])
103
+ cur_new_input_ids.append(cur_input_ids_noim[i])
104
+ if i < num_images:
105
+ cur_pixel_values = pixel_values[cur_image_idx]
106
+ cur_image_idx += 1
107
+ cur_new_inputs_embeds.append(cur_pixel_values)
108
+ cur_new_labels.append(torch.full((cur_pixel_values.shape[0], ), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
109
+ cur_new_input_ids.append(torch.full((cur_pixel_values.shape[0], ), IMAGE_TOKEN_INDEX, device=cur_input_ids.device, dtype=cur_input_ids.dtype))
110
+
111
+ cur_new_inputs_embeds = torch.cat(cur_new_inputs_embeds)
112
+ cur_new_labels = torch.cat(cur_new_labels)
113
+ cur_new_input_ids = torch.cat(cur_new_input_ids)
114
+
115
+ new_inputs_embeds.append(cur_new_inputs_embeds)
116
+ new_labels.append(cur_new_labels)
117
+ new_input_ids.append(cur_new_input_ids)
118
+
119
+ # Combine them
120
+ max_len = max(x.shape[0] for x in new_inputs_embeds)
121
+ batch_size = len(new_inputs_embeds)
122
+
123
+ new_inputs_embeds_padded = []
124
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
125
+ new_input_ids_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_input_ids[0].dtype, device=new_input_ids[0].device)
126
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
127
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
128
+
129
+ for i, (cur_new_embed, cur_new_labels, cur_new_input_ids) in enumerate(zip(new_inputs_embeds, new_labels, new_input_ids)):
130
+ cur_len = cur_new_embed.shape[0]
131
+ new_inputs_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
132
+ if cur_len > 0:
133
+ new_labels_padded[i, :cur_len] = cur_new_labels
134
+ new_input_ids_padded[i, :cur_len] = cur_new_input_ids
135
+ attention_mask[i, :cur_len] = True
136
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
137
+
138
+ new_inputs_embeds = torch.stack(new_inputs_embeds_padded, dim=0)
139
+
140
+ if _labels is None:
141
+ new_labels = None
142
+ else:
143
+ new_labels = new_labels_padded
144
+
145
+ new_input_ids = new_input_ids_padded
146
+
147
+ if _attention_mask is None:
148
+ attention_mask = None
149
+ else:
150
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
151
+
152
+ if _position_ids is None:
153
+ position_ids = None
154
+
155
+ kwargs.update({
156
+ 'input_ids': None,
157
+ 'position_ids': position_ids,
158
+ 'attention_mask': attention_mask,
159
+ 'past_key_values': past_key_values,
160
+ 'inputs_embeds': new_inputs_embeds,
161
+ 'labels': new_labels,
162
+ 'new_input_ids': new_input_ids
163
+ })
164
+ return kwargs
165
+
166
+ class Summary(Enum):
167
+ NONE = 0
168
+ AVERAGE = 1
169
+ SUM = 2
170
+ COUNT = 3
171
+
172
+
173
+ class AverageMeter(object):
174
+ """Computes and stores the average and current value"""
175
+
176
+ def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
177
+ self.name = name
178
+ self.fmt = fmt
179
+ self.summary_type = summary_type
180
+ self.reset()
181
+
182
+ def reset(self):
183
+ self.val = 0
184
+ self.avg = 0
185
+ self.sum = 0
186
+ self.count = 0
187
+
188
+ def update(self, val, n=1):
189
+ self.val = val
190
+ self.sum += val * n
191
+ self.count += n
192
+ self.avg = self.sum / self.count
193
+
194
+ def all_reduce(self):
195
+ device = "cuda" if torch.cuda.is_available() else "cpu"
196
+ if isinstance(self.sum, np.ndarray):
197
+ total = torch.tensor(
198
+ self.sum.tolist()
199
+ + [
200
+ self.count,
201
+ ],
202
+ dtype=torch.float32,
203
+ device=device,
204
+ )
205
+ else:
206
+ total = torch.tensor(
207
+ [self.sum, self.count], dtype=torch.float32, device=device
208
+ )
209
+
210
+ dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
211
+ if total.shape[0] > 2:
212
+ self.sum, self.count = total[:-1].cpu().numpy(), total[-1].cpu().item()
213
+ else:
214
+ self.sum, self.count = total.tolist()
215
+ self.avg = self.sum / (self.count + 1e-5)
216
+
217
+ def __str__(self):
218
+ fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
219
+ return fmtstr.format(**self.__dict__)
220
+
221
+ def summary(self):
222
+ fmtstr = ""
223
+ if self.summary_type is Summary.NONE:
224
+ fmtstr = ""
225
+ elif self.summary_type is Summary.AVERAGE:
226
+ fmtstr = "{name} {avg:.3f}"
227
+ elif self.summary_type is Summary.SUM:
228
+ fmtstr = "{name} {sum:.3f}"
229
+ elif self.summary_type is Summary.COUNT:
230
+ fmtstr = "{name} {count:.3f}"
231
+ else:
232
+ raise ValueError("invalid summary type %r" % self.summary_type)
233
+
234
+ return fmtstr.format(**self.__dict__)
235
+
236
+
237
+ def intersectionAndUnionGPU(output, target, K, ignore_index=255):
238
+ # 'K' classes, output and target sizes are N or N * L or N * H * W, each value in range 0 to K - 1.
239
+ assert output.dim() in [1, 2, 3]
240
+ assert output.shape == target.shape
241
+ output = output.view(-1)
242
+ target = target.view(-1)
243
+ output[target == ignore_index] = ignore_index
244
+ intersection = output[output == target]
245
+ area_intersection = torch.histc(intersection, bins=K, min=0, max=K - 1)
246
+ area_output = torch.histc(output, bins=K, min=0, max=K - 1)
247
+ area_target = torch.histc(target, bins=K, min=0, max=K - 1)
248
+ area_union = area_output + area_target - area_intersection
249
+ return area_intersection, area_union, area_target
250
+
251
+
252
+ class ProgressMeter(object):
253
+ def __init__(self, num_batches, meters, prefix=""):
254
+ self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
255
+ self.meters = meters
256
+ self.prefix = prefix
257
+
258
+ def display(self, batch):
259
+ entries = [self.prefix + self.batch_fmtstr.format(batch)]
260
+ entries += [str(meter) for meter in self.meters]
261
+ print("\t".join(entries))
262
+
263
+ def display_summary(self):
264
+ entries = [" *"]
265
+ entries += [meter.summary() for meter in self.meters]
266
+ print(" ".join(entries))
267
+
268
+ def _get_batch_fmtstr(self, num_batches):
269
+ num_digits = len(str(num_batches // 1))
270
+ fmt = "{:" + str(num_digits) + "d}"
271
+ return "[" + fmt + "/" + fmt.format(num_batches) + "]"
272
+
273
+
274
+ def dict_to_cuda(input_dict):
275
+ for k, v in input_dict.items():
276
+ if isinstance(input_dict[k], torch.Tensor):
277
+ input_dict[k] = v.cuda(non_blocking=True)
278
+ elif isinstance(v, list) and len(v) > 0:
279
+ input_dict[k] = [ele.cuda(non_blocking=True) if isinstance(ele, torch.Tensor) else ele for ele in v]
280
+ return input_dict
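As a quick illustration of how the metric helpers above fit together, the standalone sketch below accumulates per-class intersection/union histograms over a few batches of random two-class masks (the tensors are made up; only the call pattern mirrors typical evaluation code).

import torch

inter_meter = AverageMeter('Intersec', ':6.3f', Summary.SUM)
union_meter = AverageMeter('Union', ':6.3f', Summary.SUM)

for _ in range(3):  # pretend these are evaluation batches
    # float labels in {0, 1}; torch.histc inside the helper expects floating point
    pred = torch.randint(0, 2, (4, 256, 256)).float().cuda()
    target = torch.randint(0, 2, (4, 256, 256)).float().cuda()
    # the helper writes ignore_index into its first argument in place, hence the clone
    inter, union, _ = intersectionAndUnionGPU(pred.clone(), target, K=2, ignore_index=255)
    inter_meter.update(inter.cpu().numpy())
    union_meter.update(union.cpu().numpy())

iou_per_class = inter_meter.sum / (union_meter.sum + 1e-10)
print('per-class IoU:', iou_per_class, 'mIoU:', iou_per_class.mean())

In a distributed run, AverageMeter.all_reduce() sums the accumulated sums and counts across ranks before this final division.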
projects/llava_sam2/configs/sa2va_4b.py ADDED
@@ -0,0 +1,548 @@
1
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
2
+ LoggerHook, ParamSchedulerHook)
3
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
4
+ from torch.optim import AdamW
5
+ from transformers import AutoTokenizer
6
+
7
+ from xtuner.dataset import ConcatDataset
8
+ from xtuner.dataset.samplers import LengthGroupedSampler
9
+ from xtuner.engine.hooks import DatasetInfoHook
10
+ from xtuner.engine.runner import TrainLoop
11
+ from xtuner.utils import PROMPT_TEMPLATE
12
+ from xtuner.dataset.map_fns import template_map_fn_factory
13
+
14
+ from third_parts.mmdet.models.losses import DiceLoss, CrossEntropyLoss
15
+ from peft import LoraConfig
16
+
17
+ from projects.llava_sam2.models.internvl import InternVL_Slowfast
18
+
19
+ from projects.llava_sam2.models import VideoLLaVASAMModel, SAM2TrainRunner, VideoLLaVASAMModel_zero3
20
+ from projects.llava_sam2.datasets import VideoReVOSDataset, VideoMeVISDataset, VideoRefYoutubeVOSDataset, video_lisa_collate_fn, VideoSAM2Dataset
21
+ from projects.llava_sam2.datasets import VideoChatUniViDataset
22
+ from projects.llava_sam2.datasets import RefCOCOgGCGDataset, OpenPsgGCGDataset, FlickrGCGDataset, GranDfGCGDataset, OspreyDataset, OspreyDescriptionDataset, OspreyShortDescriptionDataset
23
+ from projects.llava_sam2.datasets import LLaVADataset
24
+ from projects.llava_sam2.datasets import ReferSegmDataset
25
+ from projects.llava_sam2.models.preprocess.image_resize import DirectResize
26
+
27
+ #######################################################################
28
+ # PART 1 Settings #
29
+ #######################################################################
30
+ # Model
31
+ path = './pretrained/InternVL2_5-4B'
32
+ pretrained_pth = None
33
+
34
+ # Data
35
+ prompt_template = PROMPT_TEMPLATE.phi3_chat
36
+ max_length = 8192
37
+
38
+ # Scheduler & Optimizer
39
+ batch_size = 2 # per_device
40
+ accumulative_counts = 4
41
+ dataloader_num_workers = 4
42
+ max_epochs = 1
43
+ optim_type = AdamW
44
+ # official 1024 -> 4e-5
45
+ # lr = 1e-6
46
+ lr = 4e-5
47
+ betas = (0.9, 0.999)
48
+ weight_decay = 0.05
49
+ max_norm = 1 # grad clip
50
+ warmup_ratio = 0.05
51
+
52
+ # Save
53
+ save_steps = 1000
54
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
55
+
56
+ special_tokens = ['[SEG]', '<p>', '</p>', '<vp>', '</vp>']
57
+
58
+ tokenizer = dict(
59
+ type=AutoTokenizer.from_pretrained,
60
+ pretrained_model_name_or_path=path,
61
+ trust_remote_code=True,
62
+ padding_side='right')
63
+
64
+ extra_image_processor = dict(
65
+ type=DirectResize,
66
+ target_length=1024,
67
+ )
68
+ #######################################################################
69
+ # PART 2 Model & Tokenizer & Image Processor #
70
+ #######################################################################
71
+ model = dict(
72
+ type=VideoLLaVASAMModel_zero3,
73
+ special_tokens=special_tokens,
74
+ frozen_sam2_decoder=False,
75
+ mllm=dict(
76
+ type=InternVL_Slowfast,
77
+ model_path=path,
78
+ freeze_llm=True,
79
+ freeze_visual_encoder=True,
80
+ llm_lora=dict(
81
+ type=LoraConfig,
82
+ r=128,
83
+ lora_alpha=256,
84
+ lora_dropout=0.05,
85
+ bias='none',
86
+ task_type='CAUSAL_LM'),
87
+ special_tokens=special_tokens,
88
+ ),
89
+ tokenizer=tokenizer,
90
+ grounding_encoder=dict(
91
+ type=SAM2TrainRunner,
92
+ ),
93
+ loss_mask=dict(
94
+ type=CrossEntropyLoss,
95
+ use_sigmoid=True,
96
+ reduction='mean',
97
+ loss_weight=2.0),
98
+ loss_dice=dict(
99
+ type=DiceLoss,
100
+ use_sigmoid=True,
101
+ activate=True,
102
+ reduction='mean',
103
+ naive_dice=True,
104
+ eps=1.0,
105
+ loss_weight=0.5),
106
+ pretrained_pth=pretrained_pth,
107
+ loss_sample_points=True,
108
+ # loss_sample_points=False,
109
+ bs=batch_size,
110
+ )
111
+
112
+ #######################################################################
113
+ # PART 3 Dataset & Dataloader #
114
+ #######################################################################
115
+
116
+
117
+ VIDEO_DATAS = './data/video_datas/'
118
+ IMG_DATAS = './data/image_datas/'
119
+
120
+ ############### video res
121
+ data_root_revos = './data/video_datas/revos/'
122
+ video_revos_image_folder = data_root_revos
123
+ video_revos_expression_file = data_root_revos + 'meta_expressions_train_.json'
124
+ video_revos_mask_file = data_root_revos + 'mask_dict.json'
125
+
126
+ data_root_mevis = './data/video_datas/mevis/train/'
127
+ video_mevis_image_folder = data_root_mevis + 'JPEGImages'
128
+ video_mevis_expression_file = data_root_mevis + 'meta_expressions.json'
129
+ video_mevis_mask_file = data_root_mevis + 'mask_dict.json'
130
+
131
+ data_root_refytvos = './data/video_datas/rvos/'
132
+ video_refytvos_image_folder = data_root_refytvos + 'train/JPEGImages/'
133
+ video_refytvos_expression_file = data_root_refytvos + 'meta_expressions/train/meta_expressions.json'
134
+ video_refytvos_mask_file = data_root_refytvos + 'mask_dict.pkl'
135
+
136
+ video_revos_dataset = dict(
137
+ type=VideoReVOSDataset,
138
+ image_folder=video_revos_image_folder,
139
+ expression_file=video_revos_expression_file,
140
+ mask_file=video_revos_mask_file,
141
+ tokenizer=tokenizer,
142
+ template_map_fn=dict(
143
+ type=template_map_fn_factory, template=prompt_template),
144
+ max_length=max_length,
145
+ lazy=True,
146
+ repeats=10,
147
+ special_tokens=special_tokens,
148
+ extra_image_processor=extra_image_processor,
149
+ sampled_frames=5,
150
+ )
151
+
152
+ video_mevis_dataset = dict(
153
+ type=VideoMeVISDataset,
154
+ image_folder=video_mevis_image_folder,
155
+ expression_file=video_mevis_expression_file,
156
+ mask_file=video_mevis_mask_file,
157
+ tokenizer=tokenizer,
158
+ template_map_fn=dict(
159
+ type=template_map_fn_factory, template=prompt_template),
160
+ max_length=max_length,
161
+ lazy=True,
162
+ repeats=4,
163
+ special_tokens=special_tokens,
164
+ extra_image_processor=extra_image_processor,
165
+ sampled_frames=5,
166
+ )
167
+
168
+ video_refytvos_dataset = dict(
169
+ type=VideoRefYoutubeVOSDataset,
170
+ image_folder=video_refytvos_image_folder,
171
+ expression_file=video_refytvos_expression_file,
172
+ mask_file=video_refytvos_mask_file,
173
+ tokenizer=tokenizer,
174
+ template_map_fn=dict(
175
+ type=template_map_fn_factory, template=prompt_template),
176
+ max_length=max_length,
177
+ lazy=True,
178
+ repeats=4,
179
+ special_tokens=special_tokens,
180
+ extra_image_processor=extra_image_processor,
181
+ sampled_frames=5,
182
+ )
183
+
184
+ ################### Video chat
185
+ data_root_video_chatunivi = VIDEO_DATAS + 'video_vlm/video_chat/'
186
+ video_chatunivi_image_folder = data_root_video_chatunivi + 'Activity_Videos/'
187
+ video_chatunivi_json_file = data_root_video_chatunivi+ 'video_chat.json'
188
+
189
+ video_qa_dataset = dict(
190
+ type=VideoChatUniViDataset,
191
+ image_folder=video_chatunivi_image_folder,
192
+ json_file=video_chatunivi_json_file,
193
+ tokenizer=tokenizer,
194
+ template_map_fn=dict(
195
+ type=template_map_fn_factory, template=prompt_template),
196
+ max_length=max_length,
197
+ lazy=True,
198
+ repeats=1,
199
+ special_tokens=special_tokens,
200
+ extra_image_processor=extra_image_processor,
201
+ sampled_frames=5,
202
+ )
203
+
204
+ ################## image chat
205
+ llava_vqa_dataset = dict(
206
+ type=LLaVADataset,
207
+ tokenizer=tokenizer,
208
+ data_path='data/llava_data/LLaVA-Instruct-150K/llava_v1_5_mix665k.json',
209
+ prompt_template=prompt_template,
210
+ special_tokens=special_tokens,
211
+ image_folder='data/llava_data/llava_images/',
212
+ )
213
+
214
+ ################## image res
215
+ refcoco_segm_dataset=dict(
216
+ type=ReferSegmDataset,
217
+ tokenizer=tokenizer,
218
+ special_tokens=special_tokens,
219
+ extra_image_processor=extra_image_processor,
220
+ data_root='data/ref_seg/refcoco',
221
+ data_prefix=dict(img_path='coco2014/train2014/'),
222
+ ann_file='instances.json',
223
+ split_file='refs(unc).p',
224
+ prompt_template=prompt_template,
225
+ num_classes_per_sample=5,
226
+ max_length=max_length,
227
+ )
228
+ refcoco_plus_segm_dataset=dict(
229
+ type=ReferSegmDataset,
230
+ tokenizer=tokenizer,
231
+ special_tokens=special_tokens,
232
+ extra_image_processor=extra_image_processor,
233
+ data_root='data/ref_seg/refcoco+',
234
+ data_prefix=dict(img_path='coco2014/train2014/'),
235
+ ann_file='instances.json',
236
+ split_file='refs(unc).p',
237
+ prompt_template=prompt_template,
238
+ num_classes_per_sample=5,
239
+ max_length=max_length,
240
+ )
241
+ refcocog_segm_dataset=dict(
242
+ type=ReferSegmDataset,
243
+ tokenizer=tokenizer,
244
+ special_tokens=special_tokens,
245
+ extra_image_processor=extra_image_processor,
246
+ data_root='data/ref_seg/refcocog',
247
+ data_prefix=dict(img_path='coco2014/train2014/'),
248
+ ann_file='instances.json',
249
+ split_file='refs(umd).p',
250
+ prompt_template=prompt_template,
251
+ num_classes_per_sample=5,
252
+ max_length=max_length,
253
+ )
254
+
255
+ # image gcg datas
256
+ glamm_data_root = './data/glamm_data/'
257
+
258
+ refcocog_image_path = glamm_data_root + 'images/coco2014/train2014/'
259
+ refcocog_ann_file = glamm_data_root + 'annotations/RefCOCOg_GCG_train.json'
260
+
261
+ grandf_image_path = glamm_data_root + 'images/grandf/train/'
262
+ grandf_ann_file = glamm_data_root + 'annotations/GranDf_HA_GCG_train.json'
263
+
264
+ flickr_image_path = glamm_data_root + 'images/flickr30k/Flickr30K/'
265
+ flickr_ann_file = glamm_data_root + 'annotations/flickr_mergedGT_GCG_train.json'
266
+
267
+ psg_image_path = glamm_data_root + 'images/coco2017/'
268
+ psg_ann_file = glamm_data_root + 'annotations/OpenPsgGCG_train.json'
269
+
270
+ glamm_refcocog_dataset = dict(
271
+ type=RefCOCOgGCGDataset,
272
+ image_folder=refcocog_image_path,
273
+ data_path=refcocog_ann_file,
274
+ tokenizer=tokenizer,
275
+ max_length=max_length,
276
+ special_tokens=special_tokens,
277
+ template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
278
+ extra_image_processor=extra_image_processor,
279
+ lazy=True,
280
+ repeats=1,
281
+ )
282
+
283
+ glamm_grandf_dataset = dict(
284
+ type=GranDfGCGDataset,
285
+ data_path=grandf_ann_file,
286
+ image_folder=grandf_image_path,
287
+ tokenizer=tokenizer,
288
+ max_length=max_length,
289
+ special_tokens=special_tokens,
290
+ template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
291
+ extra_image_processor=extra_image_processor,
292
+ lazy=True,
293
+ repeats=10,
294
+ )
295
+
296
+ glamm_psg_dataset = dict(
297
+ type=OpenPsgGCGDataset,
298
+ data_path=psg_ann_file,
299
+ image_folder=psg_image_path,
300
+ tokenizer=tokenizer,
301
+ max_length=max_length,
302
+ special_tokens=special_tokens,
303
+ template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
304
+ extra_image_processor=extra_image_processor,
305
+ lazy=True,
306
+ repeats=1,
307
+ )
308
+
309
+ glamm_flickr_dataset = dict(
310
+ type=FlickrGCGDataset,
311
+ data_path=flickr_ann_file,
312
+ image_folder=flickr_image_path,
313
+ tokenizer=tokenizer,
314
+ max_length=max_length,
315
+ special_tokens=special_tokens,
316
+ template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
317
+ extra_image_processor=extra_image_processor,
318
+ lazy=True,
319
+ repeats=1,
320
+ )
321
+
322
+ # sam2 data
323
+ data_sam2_folder = VIDEO_DATAS + 'segmentation_datasets/sam_v_full/'
324
+ data_sam2_expression_file = './whole_pesudo_cap_v3/sam_v_final_v3.json'
325
+
326
+ video_sam2_dataset = dict(
327
+ type=VideoSAM2Dataset,
328
+ sam2_folder=data_sam2_folder,
329
+ expression_file=data_sam2_expression_file,
330
+ tokenizer=tokenizer,
331
+ template_map_fn=dict(
332
+ type=template_map_fn_factory, template=prompt_template),
333
+ max_length=max_length,
334
+ lazy=True,
335
+ repeats=4,
336
+ special_tokens=special_tokens,
337
+ extra_image_processor=extra_image_processor,
338
+ sampled_frames=5,
339
+ select_number=5,
340
+ )
341
+
342
+ # osprey
343
+ data_osprey_file = VIDEO_DATAS + 'osprey-724k/Osprey-724K/osprey_conversation.json'
344
+ data_osprey_image_folders = [
345
+ IMG_DATAS+ 'coco/train2014/',
346
+ IMG_DATAS + 'coco/val2014/',
347
+ IMG_DATAS + 'coco/train2017/',
348
+ IMG_DATAS + 'coco/val2017/',
349
+ ]
350
+
351
+ image_osprey_dataset = dict(
352
+ type=OspreyDataset,
353
+ image_folder=data_osprey_image_folders,
354
+ data_path=data_osprey_file,
355
+ tokenizer=tokenizer,
356
+ template_map_fn=dict(
357
+ type=template_map_fn_factory, template=prompt_template),
358
+ max_length=max_length,
359
+ lazy=True,
360
+ repeats=1,
361
+ special_tokens=special_tokens,
362
+ )
363
+
364
+ data_osprey_detail_description_file = VIDEO_DATAS + 'osprey-724k/Osprey-724K/osprey_detail_description.json'
365
+ image_osprey_description_dataset = dict(
366
+ type=OspreyDescriptionDataset,
367
+ image_folder=data_osprey_image_folders,
368
+ data_path=data_osprey_detail_description_file,
369
+ tokenizer=tokenizer,
370
+ template_map_fn=dict(
371
+ type=template_map_fn_factory, template=prompt_template),
372
+ max_length=max_length,
373
+ lazy=True,
374
+ repeats=1,
375
+ special_tokens=special_tokens,
376
+ )
377
+
378
+ data_osprey_short_file = VIDEO_DATAS + 'osprey-724k/Osprey-724K/osprey_short_form.json'
379
+ image_osprey_short_dataset = dict(
380
+ type=OspreyShortDescriptionDataset,
381
+ image_folder=data_osprey_image_folders,
382
+ data_path=data_osprey_short_file,
383
+ tokenizer=tokenizer,
384
+ template_map_fn=dict(
385
+ type=template_map_fn_factory, template=prompt_template),
386
+ max_length=max_length,
387
+ lazy=True,
388
+ repeats=1,
389
+ special_tokens=special_tokens,
390
+ )
391
+
392
+ data_osprey_part_file = VIDEO_DATAS + 'osprey-724k/Osprey-724K/osprey_part_level.json'
393
+ image_osprey_part_dataset = dict(
394
+ type=OspreyDataset,
395
+ image_folder=data_osprey_image_folders,
396
+ data_path=data_osprey_part_file,
397
+ tokenizer=tokenizer,
398
+ template_map_fn=dict(
399
+ type=template_map_fn_factory, template=prompt_template),
400
+ max_length=max_length,
401
+ lazy=True,
402
+ repeats=1,
403
+ special_tokens=special_tokens,
404
+ )
405
+
406
+ data_osprey_positive_neg_file = VIDEO_DATAS + 'osprey-724k/Osprey-724K/osprey_lvis_positive_negative.json'
407
+ image_osprey_positive_neg_dataset = dict(
408
+ type=OspreyDataset,
409
+ image_folder=data_osprey_image_folders,
410
+ data_path=data_osprey_positive_neg_file,
411
+ tokenizer=tokenizer,
412
+ template_map_fn=dict(
413
+ type=template_map_fn_factory, template=prompt_template),
414
+ max_length=max_length,
415
+ lazy=True,
416
+ repeats=1,
417
+ special_tokens=special_tokens,
418
+ )
419
+
420
+ train_dataset = dict(
421
+ type=ConcatDataset, datasets=[
422
+ # sem seg
423
+ # semantic_seg_ade20k_dataset,
424
+ # ref seg
425
+ refcoco_segm_dataset, refcoco_plus_segm_dataset, refcocog_segm_dataset,
426
+ refcoco_segm_dataset, refcoco_plus_segm_dataset, refcocog_segm_dataset,
427
+ refcoco_segm_dataset, refcoco_plus_segm_dataset, refcocog_segm_dataset,
428
+ refcoco_segm_dataset, refcoco_plus_segm_dataset, refcocog_segm_dataset,
429
+ # image qa
430
+ llava_vqa_dataset,
431
+ # video res
432
+ video_mevis_dataset, video_revos_dataset, video_refytvos_dataset,
433
+ # video chat
434
+ video_qa_dataset,
435
+ # sam2 pseudo-labeled data
436
+ video_sam2_dataset,
437
+ # gcg data
438
+ glamm_psg_dataset,
439
+ glamm_grandf_dataset,
440
+ glamm_flickr_dataset,
441
+ glamm_refcocog_dataset,
442
+ # visual prompt
443
+ image_osprey_dataset, image_osprey_description_dataset,
444
+ image_osprey_part_dataset, image_osprey_short_dataset,
445
+ image_osprey_positive_neg_dataset,
446
+ ]
447
+ )
448
+ train_dataloader = dict(
449
+ batch_size=batch_size,
450
+ num_workers=dataloader_num_workers,
451
+ dataset=train_dataset,
452
+ sampler=dict(
453
+ type=LengthGroupedSampler,
454
+ length_property='modality_length',
455
+ per_device_batch_size=batch_size * accumulative_counts),
456
+ collate_fn=dict(type=video_lisa_collate_fn)
457
+ )
458
+
459
+ #######################################################################
460
+ # PART 4 Scheduler & Optimizer #
461
+ #######################################################################
462
+ # optimizer
463
+ optim_wrapper = dict(
464
+ type=AmpOptimWrapper,
465
+ optimizer=dict(
466
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
467
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
468
+ accumulative_counts=accumulative_counts,
469
+ loss_scale='dynamic',
470
+ dtype='bfloat16'
471
+ )
472
+
473
+ # learning policy
474
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
475
+ param_scheduler = [
476
+ dict(
477
+ type=LinearLR,
478
+ start_factor=1e-5,
479
+ by_epoch=True,
480
+ begin=0,
481
+ end=warmup_ratio * max_epochs,
482
+ convert_to_iter_based=True),
483
+ dict(
484
+ type=CosineAnnealingLR,
485
+ eta_min=0.0,
486
+ by_epoch=True,
487
+ begin=warmup_ratio * max_epochs,
488
+ end=max_epochs,
489
+ convert_to_iter_based=True)
490
+ ]
491
+
492
+ # train, val, test setting
493
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
494
+
495
+ #######################################################################
496
+ # PART 5 Runtime #
497
+ #######################################################################
498
+ # Log the dialogue periodically during the training process, optional
499
+ custom_hooks = [
500
+ # dict(type=DatasetInfoHook, tokenizer=tokenizer),
501
+ ]
502
+
503
+ # configure default hooks
504
+ default_hooks = dict(
505
+ # record the time of every iteration.
506
+ timer=dict(type=IterTimerHook),
507
+ # print log every 10 iterations.
508
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
509
+ # enable the parameter scheduler.
510
+ param_scheduler=dict(type=ParamSchedulerHook),
511
+ # save checkpoint per `save_steps`.
512
+ checkpoint=dict(
513
+ type=CheckpointHook,
514
+ save_optimizer=False,
515
+ by_epoch=False,
516
+ interval=save_steps,
517
+ max_keep_ckpts=save_total_limit),
518
+ # set sampler seed in distributed environment.
519
+ sampler_seed=dict(type=DistSamplerSeedHook),
520
+ )
521
+
522
+ # configure environment
523
+ env_cfg = dict(
524
+ # whether to enable cudnn benchmark
525
+ cudnn_benchmark=False,
526
+ # set multi process parameters
527
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
528
+ # set distributed parameters
529
+ dist_cfg=dict(backend='nccl'),
530
+ )
531
+
532
+ # set visualizer
533
+ visualizer = None
534
+
535
+ # set log level
536
+ log_level = 'INFO'
537
+
538
+ # load from which checkpoint
539
+ load_from = None
540
+
541
+ # whether to resume training from the loaded checkpoint
542
+ resume = False
543
+
544
+ # Defaults to use random seed and disable `deterministic`
545
+ randomness = dict(seed=None, deterministic=False)
546
+
547
+ # set log processor
548
+ log_processor = dict(by_epoch=False)
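Two quick sanity checks implied by the numbers in this config (a standalone sketch; the world size of 8 GPUs is an assumption, everything else is read from the settings above): the effective batch size, and an approximation of the warmup-plus-cosine schedule assembled in PART 4.

import math

num_gpus = 8                                        # assumed world size
batch_size, accumulative_counts = 2, 4              # PART 1
lr, start_factor, warmup_ratio = 4e-5, 1e-5, 0.05   # PART 1 / PART 4

effective_batch = batch_size * accumulative_counts * num_gpus
print('effective batch size:', effective_batch)     # 64 with the assumed 8 GPUs

def lr_at(progress):
    """Approximate LR at a given fraction of the single training epoch."""
    if progress < warmup_ratio:  # LinearLR: factor ramps from start_factor up to 1
        return lr * (start_factor + (1 - start_factor) * progress / warmup_ratio)
    # CosineAnnealingLR with eta_min=0 over the remaining part of the epoch
    t = (progress - warmup_ratio) / (1 - warmup_ratio)
    return 0.5 * lr * (1 + math.cos(math.pi * t))

for p in (0.0, 0.05, 0.5, 1.0):
    print(f'progress {p:.2f}: lr ~ {lr_at(p):.2e}')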
projects/llava_sam2/datasets/ChatUniVi_Dataset.py ADDED
@@ -0,0 +1,389 @@
1
+ import logging
2
+ import os
3
+ from typing import Literal
4
+
5
+ import torch
6
+ from datasets import Dataset as HFDataset
7
+ from datasets import DatasetDict, load_from_disk
8
+ from mmengine import print_log
9
+ from PIL import Image
10
+ from torch.utils.data import Dataset
11
+ import numpy as np
12
+
13
+ from xtuner.registry import BUILDER
14
+ from xtuner.dataset.huggingface import build_origin_dataset
15
+ import copy
16
+ from .encode_fn import video_lisa_encode_fn
17
+ import json
18
+ import cv2
19
+ import torchvision.transforms as T
20
+ from torchvision.transforms.functional import InterpolationMode
21
+ from decord import VideoReader, cpu
22
+
23
+
24
+ def _get_rawvideo_dec(video_path, select_frames=5):
25
+
26
+ if os.path.exists(video_path):
27
+ vreader = VideoReader(video_path, ctx=cpu(0))
28
+ elif os.path.exists(video_path.replace('mkv', 'mp4')):
29
+ vreader = VideoReader(video_path.replace('mkv', 'mp4'), ctx=cpu(0))
30
+ else:
31
+ print(video_path)
32
+ raise FileNotFoundError
33
+
34
+ fps = vreader.get_avg_fps()
35
+ f_start = 0
36
+ f_end = len(vreader) - 1
37
+ num_frames = f_end - f_start + 1
38
+ assert num_frames > 0, f'num_frames: {num_frames}, f_start: {f_start}, f_end: {f_end}, fps: {fps}, video_path: {video_path}'
39
+ # T x 3 x H x W
40
+ if num_frames <= select_frames:
41
+ sample_pos = range(f_start, f_end + 1)
42
+ else:
43
+ split_point = np.linspace(0, num_frames, num=select_frames+1, dtype=int)
44
+ sample_pos = [np.random.randint(split_point[i], split_point[i+1]) for i in range(select_frames)]
45
+ patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
46
+ return patch_images
47
+
48
+
49
+ class VideoChatUniViDataset(Dataset):
50
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
51
+ IMAGENET_STD = (0.229, 0.224, 0.225)
52
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
53
+ IMG_START_TOKEN = '<img>'
54
+ IMG_END_TOKEN = '</img>'
55
+
56
+ FAST_IMG_CONTEXT_TOKEN = '<FAST_IMG_CONTEXT>'
57
+ FAST_IMG_START_TOKEN = '<fast_img>'
58
+ FAST_IMG_END_TOKEN = '</fast_img>'
59
+
60
+ def __init__(self,
61
+ image_folder,
62
+ json_file,
63
+ extra_image_processor=None,
64
+ tokenizer=None,
65
+ sampled_frames=10,
66
+ offline_processed_text_folder=None,
67
+ template_map_fn=None,
68
+ max_length=2048,
69
+ lazy=True,
70
+ repeats=1,
71
+ special_tokens=None,
72
+ use_fast=False,
73
+ n_fast_images=50,
74
+ fast_pool_size=4,
75
+ arch_type: Literal['intern_vl', 'qwen', 'llava'] = 'intern_vl',
76
+ preprocessor=None,
77
+ ):
78
+ assert lazy is True
79
+ self.tokenizer = BUILDER.build(tokenizer)
80
+ self.sampled_frames = sampled_frames
81
+ assert offline_processed_text_folder or (json_file and tokenizer)
82
+ self.lazy = lazy
83
+
84
+ self.max_length = max_length
85
+
86
+ self.template_map_fn = template_map_fn
87
+ if isinstance(self.template_map_fn, dict) and self.lazy:
88
+ _type = self.template_map_fn['type']
89
+ del self.template_map_fn['type']
90
+ self.template_map_fn = _type(**self.template_map_fn)
91
+
92
+ if offline_processed_text_folder and json_file:
93
+ print_log(
94
+ 'Both `offline_processed_text_folder` and '
95
+ '`json_file` are set, and we load dataset from '
96
+ '`offline_processed_text_folder` '
97
+ f'({offline_processed_text_folder})',
98
+ logger='current',
99
+ level=logging.WARNING)
100
+
101
+ if offline_processed_text_folder is not None:
102
+ raise NotImplementedError
103
+ else:
104
+ json_datas = self.json_file_preprocess(json_file)
105
+ self.json_datas = json_datas
106
+ json_data = DatasetDict({'train': HFDataset.from_list(json_datas)})
107
+ if self.lazy:
108
+ self.text_data = build_origin_dataset(json_data, 'train')
109
+ else:
110
+ raise NotImplementedError
111
+
112
+ self.image_folder = image_folder
113
+ if extra_image_processor is not None:
114
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
115
+
116
+ self.arch_type = arch_type
117
+ if self.arch_type == 'qwen':
118
+ self.IMG_CONTEXT_TOKEN = '<|image_pad|>'
119
+ self.IMG_START_TOKEN = '<|vision_start|>'
120
+ self.IMG_END_TOKEN = '<|vision_end|>'
121
+ elif self.arch_type == 'llava':
122
+ self.IMG_CONTEXT_TOKEN = '<image>'
123
+ self.IMG_START_TOKEN = ''
124
+ self.IMG_END_TOKEN = ''
125
+ self.repeats = repeats
126
+
127
+ self._system = ''
128
+
129
+ self.downsample_ratio = 0.5
130
+ if self.arch_type == 'llava':
131
+ self.downsample_ratio = 1
132
+ self.image_size = 448
133
+ if self.arch_type == 'llava':
134
+ self.image_size = 336
135
+ patch_size = 14
136
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
137
+ if self.arch_type == 'qwen':
138
+ self.patch_token = 1
139
+
140
+ if preprocessor is None:
141
+ self.transformer = T.Compose([
142
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
143
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
144
+ T.ToTensor(),
145
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
146
+ ])
147
+ self.preprocessor = None
148
+ else:
149
+ self.transformer = None
150
+ self.preprocessor = BUILDER.build(preprocessor)
151
+
152
+ self.arch_type = arch_type
153
+
154
+ if special_tokens is not None:
155
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
156
+
157
+ self.use_fast = use_fast
158
+ self.n_fast_images = n_fast_images
159
+ self.fast_pool_size = fast_pool_size
160
+
161
+ # for visualization debug
162
+ self.save_folder = './work_dirs/video_debug/'
163
+ self.cur_number = 0
164
+
165
+ print("Video Chat dataset contains {} items.".format(len(self.text_data)))
166
+
167
+ def __len__(self):
168
+ return len(self.text_data) * self.repeats
169
+
170
+ @property
171
+ def modality_length(self):
172
+ length_list = []
173
+ for data_dict in self.text_data:
174
+ cur_len = 10000
175
+ length_list.append(cur_len)
176
+ return length_list
177
+
178
+ def real_len(self):
179
+ return len(self.text_data)
180
+
181
+ def json_file_preprocess(self, json_file):
182
+ # prepare expression annotation files
183
+ with open(json_file, 'r') as f:
184
+ json_datas = json.load(f)
185
+ return json_datas
186
+
187
+ def dataset_map_fn(self, data_dict, select_k=5):
188
+ assert 'video' in data_dict
189
+ # video
190
+ video_file = data_dict['video']
191
+ video_file = os.path.join(self.image_folder, video_file)
192
+ images = _get_rawvideo_dec(video_file, select_frames=select_k)
193
+ if self.use_fast:
194
+ fast_images = _get_rawvideo_dec(video_file, select_frames=self.n_fast_images)
195
+ else:
196
+ fast_images = None
197
+
198
+ conversation = data_dict['conversations']
199
+
200
+ # prepare text
201
+ if self.use_fast:
202
+ text_dict = self.prepare_text(
203
+ select_k, conversation, num_image_tokens=self.patch_token,
204
+ n_fast_images=len(fast_images),
205
+ )
206
+ else:
207
+ text_dict = self.prepare_text(
208
+ select_k, conversation, num_image_tokens=self.patch_token,
209
+ )
210
+
211
+
212
+ ret = {'images': images, 'conversation': text_dict['conversation'], 'fast_images': fast_images}
213
+ return ret
214
+
215
+ def prepare_text(self, n_frames, conversation, num_image_tokens=256, n_fast_images=0):
216
+
217
+ if self.use_fast:
218
+ fast_frame_token_str = f'{self.FAST_IMG_START_TOKEN}' \
219
+ f'{self.FAST_IMG_CONTEXT_TOKEN * n_fast_images * self.fast_pool_size * self.fast_pool_size}' \
220
+ f'{self.FAST_IMG_END_TOKEN}' + '\n'
221
+ else:
222
+ fast_frame_token_str = ''
223
+
224
+ frame_token_str = f'{self.IMG_START_TOKEN}' \
225
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
226
+ f'{self.IMG_END_TOKEN}'
227
+
228
+ questions = []
229
+ answers = []
230
+
231
+ for conv in conversation:
232
+ if conv['from'] == 'human':
233
+ questions.append(conv['value'].replace('<image>', ''))
234
+ else:
235
+ answers.append(conv['value'])
236
+ assert len(questions) == len(answers)
237
+
238
+ qa_list = []
239
+ for i, (question, answer) in enumerate(zip(questions, answers)):
240
+ if i == 0:
241
+ frame_tokens = frame_token_str + '\n'
242
+ # frame_tokens = '=' + ' '
243
+ frame_tokens = frame_tokens * n_frames
244
+ frame_tokens = frame_tokens.strip()
245
+ frame_tokens = fast_frame_token_str + frame_tokens
246
+ qa_list.append(
247
+ {'from': 'human', 'value': frame_tokens + question}
248
+ )
249
+ else:
250
+ qa_list.append(
251
+ {'from': 'human', 'value': question}
252
+ )
253
+ qa_list.append(
254
+ {'from': 'gpt', 'value': answer}
255
+ )
256
+
257
+ input = ''
258
+ conversation = []
259
+ for msg in qa_list:
260
+ if msg['from'] == 'human':
261
+ input += msg['value']
262
+ elif msg['from'] == 'gpt':
263
+ conversation.append({'input': input, 'output': msg['value']})
264
+ input = ''
265
+ else:
266
+ raise NotImplementedError
267
+
268
+ # add system information
269
+ conversation[0].update({'system': self._system})
270
+ return {'conversation': conversation}
271
+
272
+ def __getitem__(self, index):
273
+ index = index % self.real_len()
274
+ selected_data_dict = copy.deepcopy(self.text_data[index])
275
+ data_dict = self.dataset_map_fn(selected_data_dict, select_k=self.sampled_frames)
276
+
277
+
278
+ assert 'images' in data_dict.keys()
279
+ if self.use_fast:
280
+ assert 'fast_images' in data_dict.keys()
281
+ pixel_values = []
282
+ num_video_tokens = None
283
+ num_frame_tokens = None
284
+ if data_dict.get('images', None) is not None:
285
+ frames_files = data_dict['images']
286
+ for frame_image in frames_files:
287
+ frame_image = frame_image.convert('RGB')
288
+ ori_width, ori_height = frame_image.size
289
+
290
+ if self.preprocessor is not None:
291
+ pass
292
+ else:
293
+ frame_image = self.transformer(frame_image)
294
+ pixel_values.append(frame_image)
295
+
296
+ if self.preprocessor is not None:
297
+ if self.arch_type == 'qwen':
298
+ _data_dict = self.preprocessor(pixel_values, do_resize=True, size=(self.image_size, self.image_size))
299
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
300
+ _data_dict['image_grid_thw'] = torch.tensor(_data_dict['image_grid_thw'], dtype=torch.int)
301
+ num_frame_tokens = int(_data_dict['image_grid_thw'][0].prod() * (self.downsample_ratio ** 2))
302
+ num_frames = _data_dict['image_grid_thw'].shape[0]
303
+ num_video_tokens = num_frame_tokens * num_frames
304
+ elif self.arch_type == 'llava':
305
+ _data_dict = self.preprocessor(pixel_values, do_resize=True,
306
+ size=(self.image_size, self.image_size))
307
+ _data_dict['pixel_values'] = np.stack(_data_dict['pixel_values'], axis=0)
308
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
309
+ else:
310
+ raise NotImplementedError
311
+ data_dict.update(_data_dict)
312
+ else:
313
+ pixel_values = torch.stack(pixel_values, dim=0) # (n_f, 3, h, w)
314
+ data_dict['pixel_values'] = pixel_values
315
+ else:
316
+ data_dict['pixel_values'] = torch.zeros(0, 3, self.image_size, self.image_size)
317
+ data_dict['masks'] = None
318
+
319
+ if num_video_tokens is not None:
320
+ assert self.patch_token == 1
321
+ input_str = data_dict['conversation'][0]['input']
322
+ input_str = input_str.replace(self.IMG_CONTEXT_TOKEN, self.IMG_CONTEXT_TOKEN * num_frame_tokens)
323
+ assert input_str.count(self.IMG_CONTEXT_TOKEN) == num_video_tokens
324
+ data_dict['conversation'][0]['input'] = input_str
325
+
326
+ result = self.template_map_fn(data_dict)
327
+ data_dict.update(result)
328
+ result = video_lisa_encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length, with_image_token=True)
329
+ data_dict.update(result)
330
+
331
+ # for fast branch
332
+ if self.use_fast:
333
+ fast_pixel_values = []
334
+ frames_files = data_dict['fast_images']
335
+ for frame_image in frames_files:
336
+ frame_image = frame_image.convert('RGB')
337
+ ori_width, ori_height = frame_image.size
338
+
339
+ frame_image = self.transformer(frame_image)
340
+ fast_pixel_values.append(frame_image)
341
+
342
+ fast_pixel_values = torch.stack(fast_pixel_values, dim=0) # (n_f, 3, h, w)
343
+ data_dict['fast_pixel_values'] = fast_pixel_values
344
+
345
+
346
+ # # for debug
347
+ # self.visualization_debug(data_dict)
348
+ # if self.cur_number < 10:
349
+ # return self[random.randint(0, len(self))]
350
+
351
+ data_dict['type'] = 'video'
352
+ return data_dict
353
+
354
+ def visualization_debug(self, data_dict):
355
+ save_folder = os.path.join(self.save_folder, 'sample_{}'.format(self.cur_number))
356
+ if not os.path.exists(save_folder):
357
+ os.mkdir(save_folder)
358
+ self.cur_number += 1
359
+
360
+ # images
361
+
362
+ show_images = []
363
+
364
+ pixel_values = data_dict['pixel_values']
365
+ save_folder_image = os.path.join(save_folder, 'image')
366
+ if not os.path.exists(save_folder_image):
367
+ os.mkdir(save_folder_image)
368
+ for i_image, image_pixel_value in enumerate(pixel_values):
369
+ # print(image_pixel_value.shape)
370
+ image_pixel_value[0] = image_pixel_value[0] * 0.2686
371
+ image_pixel_value[1] = image_pixel_value[1] * 0.2613
372
+ image_pixel_value[2] = image_pixel_value[2] * 0.2757
373
+ image_pixel_value[0] = image_pixel_value[0] + 0.4814
374
+ image_pixel_value[1] = image_pixel_value[1] + 0.4578
375
+ image_pixel_value[2] = image_pixel_value[2] + 0.4082
376
+ image_pixel_value = image_pixel_value * 255
377
+ image_pixel_value = image_pixel_value.permute(1, 2, 0)
378
+ image_pixel_value = image_pixel_value.to(torch.uint8).numpy()
379
+ # print(os.path.join(save_folder_image, '{}.jpg'.format(i_image)))
380
+ # print(image_pixel_value.shape)
381
+ show_images.append(image_pixel_value)
382
+ cv2.imwrite(os.path.join(save_folder_image, '{}.jpg'.format(i_image)), image_pixel_value)
383
+
384
+ # text
385
+ input_text = self.tokenizer.decode(data_dict['input_ids'], skip_special_tokens=False)
386
+ with open(os.path.join(save_folder, 'text.json'), 'w') as f:
387
+ json.dump([input_text], f)
388
+
389
+ return
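The frame sampling in `_get_rawvideo_dec` above is the one non-obvious step of this dataset: when the clip has more frames than requested, it is split into `select_frames` equal chunks and one random index is drawn from each chunk, so the selection stays roughly uniform over the whole video while remaining stochastic. A standalone sketch of just that rule (the clip length of 137 frames is made up):

import numpy as np

num_frames, select_frames = 137, 5   # hypothetical clip length, frames to keep
split_point = np.linspace(0, num_frames, num=select_frames + 1, dtype=int)
sample_pos = [np.random.randint(split_point[i], split_point[i + 1])
              for i in range(select_frames)]

print(split_point)   # [  0  27  54  82 109 137] -> five roughly equal chunks
print(sample_pos)    # one random frame index per chunk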
projects/llava_sam2/datasets/GCG_Dataset.py ADDED
@@ -0,0 +1,375 @@
1
+ import json
2
+ import os
3
+
4
+ import torch
5
+ from datasets import Dataset as HFDataset
6
+ from datasets import DatasetDict, load_from_disk
7
+ from PIL import Image
8
+ from torch.utils.data import Dataset
9
+ from pycocotools import mask
10
+ import numpy as np
11
+ import copy
12
+
13
+ from xtuner.registry import BUILDER
14
+ from xtuner.dataset.huggingface import process_hf_dataset, build_origin_dataset
15
+ import torchvision.transforms as T
16
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN
17
+ from torchvision.transforms.functional import InterpolationMode
18
+ from .encode_fn import video_lisa_encode_fn
19
+ from .utils import dynamic_preprocess
20
+
21
+ from .gcg_process import glamm_openpsg_map_fn, glamm_flickr_map_fn, glamm_granf_map_fn, glamm_refcocog_map_fn
22
+
23
+ class GCGDataset(Dataset):
24
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
25
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
26
+ IMG_START_TOKEN = '<img>'
27
+ IMG_END_TOKEN = '</img>'
28
+
29
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
30
+ IMAGENET_STD = (0.229, 0.224, 0.225)
31
+ def __init__(self,
32
+ image_folder,
33
+ data_path=None,
34
+ tokenizer=None,
35
+ max_length=8196,
36
+ special_tokens=None,
37
+ template_map_fn=None,
38
+ extra_image_processor=None,
39
+ lazy=True,
40
+ repeats=1,
41
+ single_image_mode=False,
42
+ ):
43
+ super().__init__()
44
+ assert lazy
45
+ self.lazy = lazy
46
+ self.max_length = max_length
47
+
48
+ json_data = self.json_file_preprocess(data_path)
49
+ json_data = DatasetDict({'train': HFDataset.from_list(json_data)})
50
+ self.text_data = build_origin_dataset(json_data, 'train')
51
+
52
+ self.image_folder = image_folder
53
+
54
+ self.tokenizer = BUILDER.build(tokenizer)
55
+ if special_tokens is not None:
56
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
57
+
58
+ self.template_map_fn = template_map_fn
59
+ if isinstance(self.template_map_fn, dict) and self.lazy:
60
+ _type = self.template_map_fn['type']
61
+ del self.template_map_fn['type']
62
+ self.template_map_fn = _type(**self.template_map_fn)
63
+
64
+ if extra_image_processor is not None:
65
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
66
+
67
+ self.repeats = repeats
68
+
69
+ self._system = ''
70
+
71
+ self.min_dynamic_patch = 1
72
+ self.max_dynamic_patch = 12
73
+ self.downsample_ratio = 0.5
74
+ self.image_size = 448
75
+ self.use_thumbnail = True
76
+ patch_size = 14
77
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
78
+
79
+ self.transformer = T.Compose([
80
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
81
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
82
+ T.ToTensor(),
83
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
84
+ ])
85
+
86
+ if special_tokens is not None:
87
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
88
+
89
+ self.single_image_mode = single_image_mode
90
+
91
+ def json_file_preprocess(self, data_path):
92
+ with open(data_path, 'r') as f:
93
+ json_data = json.load(f)
94
+ return json_data
95
+
96
+ @property
97
+ def modality_length(self):
98
+ length_list = []
99
+ for data_dict in self.text_data:
100
+ if self.lazy:
101
+ cur_len = 100
102
+ else:
103
+ cur_len = len(data_dict['input_ids'])
104
+ if data_dict.get('image', None) is None:
105
+ cur_len = -cur_len
106
+ length_list.append(cur_len)
107
+ return length_list * self.repeats
108
+
109
+ def __len__(self):
110
+ return len(self.text_data) * self.repeats
111
+
112
+ def real_len(self):
113
+ return len(self.text_data)
114
+
115
+ def decode_mask(self, object_masks, ori_height, ori_width):
116
+ binary_masks = []
117
+ for object_mask in object_masks:
118
+ binary_mask = np.zeros((ori_height, ori_width), dtype=np.uint8)
119
+ for seg in object_mask:
120
+ rles = mask.frPyObjects([seg], ori_height, ori_width)
121
+ m = mask.decode(rles)
122
+ m = m.astype(np.uint8)
123
+ binary_mask += m.squeeze()
124
+
125
+ binary_masks.append(binary_mask)
126
+ if len(binary_masks) == 0:
127
+ return None
128
+ masks = np.stack(binary_masks, axis=0)
129
+ masks = torch.from_numpy(masks)
130
+ return masks
131
+
132
+ def dataset_map_fn(self, data_dict):
133
+ data_dict = glamm_refcocog_map_fn(data_dict)
134
+ return data_dict
135
+
136
+ def replace_image_str(self, data_dict, image_str):
137
+ data_dict['conversation'][0]['input'] = \
138
+ data_dict['conversation'][0]['input'].replace(DEFAULT_IMAGE_TOKEN, image_str)
139
+ return data_dict
140
+
141
+ def __getitem__(self, index):
142
+
143
+ index = index % self.real_len()
144
+ data_dict = copy.deepcopy(self.text_data[index])
145
+
146
+ # parse datasets
147
+ result = self.dataset_map_fn(data_dict)
148
+ data_dict.update(result)
149
+
150
+ # process image
151
+ image_file = data_dict['image']
152
+ image = Image.open(os.path.join(self.image_folder,
153
+ image_file)).convert('RGB')
154
+ ori_width, ori_height = image.size
155
+ if hasattr(self, 'extra_image_processor'):
156
+ g_image = np.array(image) # for grounding
157
+ g_image = self.extra_image_processor.apply_image(g_image)
158
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
159
+ data_dict['g_pixel_values'] = g_pixel_values
160
+
161
+ if self.single_image_mode:
162
+ images = [image]
163
+ else:
164
+ images = dynamic_preprocess(image, self.min_dynamic_patch,
165
+ self.max_dynamic_patch,
166
+ self.image_size, self.use_thumbnail)
167
+ pixel_values = [self.transformer(image) for image in images]
168
+ pixel_values = torch.stack(pixel_values)
169
+ data_dict['pixel_values'] = pixel_values
170
+
171
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
172
+ image_token_str = f'{self.IMG_START_TOKEN}' \
173
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
174
+ f'{self.IMG_END_TOKEN}'
175
+
176
+ data_dict = self.replace_image_str(data_dict, image_token_str)
177
+
178
+ result = self.template_map_fn(data_dict)
179
+ data_dict.update(result)
180
+ result = video_lisa_encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length,
181
+ with_image_token=True)
182
+ data_dict.update(result)
183
+ # process mask
184
+ data_dict['masks'] = self.decode_mask(data_dict['masks'], ori_height=ori_height, ori_width=ori_width)
185
+
186
+ if data_dict['masks'] is None:
187
+ return self.__getitem__(0)
188
+
189
+ return data_dict
190
+
191
+ class RefCOCOgGCGDataset(GCGDataset):
192
+ def __init__(self,
193
+ image_folder,
194
+ data_path=None,
195
+ tokenizer=None,
196
+ max_length=8196,
197
+ special_tokens=None,
198
+ template_map_fn=None,
199
+ extra_image_processor=None,
200
+ lazy=True,
201
+ repeats=1,
202
+ single_image_mode=False,
203
+ ):
204
+ super().__init__(
205
+ image_folder=image_folder,
206
+ data_path=data_path,
207
+ tokenizer=tokenizer,
208
+ max_length=max_length,
209
+ special_tokens=special_tokens,
210
+ template_map_fn=template_map_fn,
211
+ extra_image_processor=extra_image_processor,
212
+ lazy=lazy,
213
+ repeats=repeats,
214
+ single_image_mode=single_image_mode,
215
+ )
216
+
217
+ def json_file_preprocess(self, data_path):
218
+ json_data = json.load(open(data_path))
219
+
220
+ # convert {id: dict} to dict(..., id=xx)
221
+ for idx in range(len(json_data)):
222
+ id = list(json_data[idx].keys())[0]
223
+ json_data[idx] = json_data[idx][id]
224
+ json_data[idx].update({'id': id})
225
+ return json_data
226
+
227
+ class GranDfGCGDataset(GCGDataset):
228
+ def __init__(self,
229
+ image_folder,
230
+ data_path=None,
231
+ tokenizer=None,
232
+ max_length=8196,
233
+ special_tokens=None,
234
+ template_map_fn=None,
235
+ extra_image_processor=None,
236
+ lazy=True,
237
+ repeats=1,
238
+ single_image_mode=False,
239
+ ):
240
+ super().__init__(
241
+ image_folder=image_folder,
242
+ data_path=data_path,
243
+ tokenizer=tokenizer,
244
+ max_length=max_length,
245
+ special_tokens=special_tokens,
246
+ template_map_fn=template_map_fn,
247
+ extra_image_processor=extra_image_processor,
248
+ lazy=lazy,
249
+ repeats=repeats,
250
+ single_image_mode=single_image_mode,
251
+ )
252
+
253
+ def dataset_map_fn(self, data_dict):
254
+ data_dict = glamm_granf_map_fn(data_dict)
255
+ return data_dict
256
+
257
+ def decode_mask(self, object_masks, ori_height, ori_width):
258
+ binary_masks = []
259
+ for object_mask in object_masks:
260
+ binary_mask = np.zeros((ori_height, ori_width), dtype=np.uint8)
261
+
262
+ for rle in object_mask:
263
+ m = mask.decode(rle).astype(np.uint8)
264
+ binary_mask += m.squeeze()
265
+
266
+ binary_masks.append(binary_mask)
267
+ if len(binary_masks) == 0:
268
+ return None
269
+ masks = np.stack(binary_masks, axis=0)
270
+ masks = torch.from_numpy(masks)
271
+ return masks
272
+
273
+ class OpenPsgGCGDataset(GranDfGCGDataset):
274
+ def __init__(self,
275
+ image_folder,
276
+ data_path=None,
277
+ tokenizer=None,
278
+ max_length=8196,
279
+ special_tokens=None,
280
+ template_map_fn=None,
281
+ extra_image_processor=None,
282
+ lazy=True,
283
+ repeats=1,
284
+ single_image_mode=False,
285
+ ):
286
+ super().__init__(
287
+ image_folder=image_folder,
288
+ data_path=data_path,
289
+ tokenizer=tokenizer,
290
+ max_length=max_length,
291
+ special_tokens=special_tokens,
292
+ template_map_fn=template_map_fn,
293
+ extra_image_processor=extra_image_processor,
294
+ lazy=lazy,
295
+ repeats=repeats,
296
+ single_image_mode=single_image_mode,
297
+ )
298
+ def dataset_map_fn(self, data_dict):
299
+ data_dict = glamm_openpsg_map_fn(data_dict)
300
+ return data_dict
301
+
302
+
303
+ class FlickrGCGDataset(GCGDataset):
304
+ def __init__(self,
305
+ image_folder,
306
+ data_path=None,
307
+ tokenizer=None,
308
+ max_length=8196,
309
+ special_tokens=None,
310
+ template_map_fn=None,
311
+ extra_image_processor=None,
312
+ lazy=True,
313
+ repeats=1,
314
+ single_image_mode=False,
315
+ ):
316
+ super().__init__(
317
+ image_folder=image_folder,
318
+ data_path=data_path,
319
+ tokenizer=tokenizer,
320
+ max_length=max_length,
321
+ special_tokens=special_tokens,
322
+ template_map_fn=template_map_fn,
323
+ extra_image_processor=extra_image_processor,
324
+ lazy=lazy,
325
+ repeats=repeats,
326
+ single_image_mode=single_image_mode,
327
+ )
328
+
329
+ def dataset_map_fn(self, data_dict):
330
+ data_dict = glamm_flickr_map_fn(data_dict)
331
+ return data_dict
332
+
333
+ def json_file_preprocess(self, data_path):
334
+ def filter_images(data_infos, min_size):
335
+ return [i for i, info in enumerate(data_infos) if min(info['width'], info['height']) >= min_size]
336
+
337
+ # build per-image info dicts from the COCO-format Flickr annotation file
338
+ from pycocotools.coco import COCO
339
+ self.coco = COCO(data_path)
340
+ self.image_ids = self.coco.getImgIds()
341
+ data_infos = []
342
+ total_ann_ids = []
343
+ removed_img_count = 0
344
+ for img_id in self.image_ids:
345
+ info = self.coco.loadImgs([img_id])[0]
346
+ if len(info['caption'].split(' ')) < 3:
347
+ removed_img_count += 1
348
+ continue
349
+ info['filename'] = info['file_name'].split('_')[-1]
350
+ info['height'] = int(info['height'])
351
+ info['width'] = int(info['width'])
352
+ data_infos.append(info)
353
+ ann_ids = self.coco.getAnnIds(imgIds=[img_id])
354
+ total_ann_ids.extend(ann_ids)
355
+ assert len(set(total_ann_ids)) == len(total_ann_ids), f"Non-unique annotation IDs in '{data_path}'!"
356
+ print(f'Removed {removed_img_count} images.')
357
+ data_infos = [data_infos[i] for i in filter_images(data_infos, min_size=32)]
358
+
359
+ # obtain_annotations
360
+ for data_info in data_infos:
361
+ ann_ids = self.coco.getAnnIds(imgIds=data_info['id'])
362
+ ann_info = self.coco.loadAnns(ann_ids)
363
+ data_info.update({'ann_info': ann_info})
364
+ return data_infos
365
+
366
+ def decode_mask(self, object_masks, ori_height, ori_width):
367
+ binary_masks = []
368
+ for object_mask in object_masks:
369
+ binary_mask = mask.decode(object_mask).astype(np.uint8)
370
+ binary_masks.append(binary_mask)
371
+ if len(binary_masks) == 0:
372
+ return None
373
+ masks = np.stack(binary_masks, axis=0)
374
+ masks = torch.from_numpy(masks)
375
+ return masks
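Note: a rough sketch (counts are illustrative, not from this commit) of the image-token accounting shared by the GCG datasets above. With image_size=448, patch_size=14 and downsample_ratio=0.5, each 448x448 crop produced by dynamic_preprocess contributes 256 <IMG_CONTEXT> tokens.

# Sketch only; mirrors the constants set in GCGDataset.__init__.
image_size, patch_size, downsample_ratio = 448, 14, 0.5
patch_token = int((image_size // patch_size) ** 2 * downsample_ratio ** 2)  # 32 * 32 * 0.25 = 256
num_crops = 7                               # e.g. 6 dynamic tiles + 1 thumbnail (illustrative)
num_image_tokens = num_crops * patch_token  # 1792 tokens spliced into the conversation
image_token_str = '<img>' + '<IMG_CONTEXT>' * num_image_tokens + '</img>'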
projects/llava_sam2/datasets/Grand_Dataset.py ADDED
@@ -0,0 +1,241 @@
1
+ import json
2
+ import os
3
+ import random
4
+
5
+ import torch
6
+ from datasets import Dataset as HFDataset
7
+ from datasets import DatasetDict, load_from_disk
8
+ from PIL import Image
9
+ from torch.utils.data import Dataset
10
+ from pycocotools import mask
11
+ import numpy as np
12
+ import copy
13
+
14
+ from xtuner.registry import BUILDER
15
+ from xtuner.dataset.huggingface import process_hf_dataset, build_origin_dataset
16
+ import torchvision.transforms as T
17
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN
18
+ from torchvision.transforms.functional import InterpolationMode
19
+ from .encode_fn import video_lisa_encode_fn
20
+ from .utils import dynamic_preprocess
21
+
22
+ from .grand_process import glamm_grand_map_fn
23
+
24
+ class GranDDataset(Dataset):
25
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
26
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
27
+ IMG_START_TOKEN = '<img>'
28
+ IMG_END_TOKEN = '</img>'
29
+
30
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
31
+ IMAGENET_STD = (0.229, 0.224, 0.225)
32
+ def __init__(self,
33
+ image_folder,
34
+ json_folder=None,
35
+ tokenizer=None,
36
+ max_length=8196,
37
+ special_tokens=None,
38
+ template_map_fn=None,
39
+ extra_image_processor=None,
40
+ lazy=True,
41
+ repeats=1,
42
+ single_image_mode=False,
43
+ image_list_save_path='./work_dirs/grand_image.json',
44
+ json_list_save_path='./work_dirs/grand_jsons.json',
45
+ ):
46
+ super().__init__()
47
+ assert lazy
48
+ self.lazy = lazy
49
+ self.max_length = max_length
50
+
51
+ self.image_list_save_path = image_list_save_path
52
+ self.json_list_save_path = json_list_save_path
53
+
54
+ json_files, image_path_dict = self.json_file_preprocess(image_folder, json_folder)
55
+ self.json_data = json_files
56
+ self.image_path_dict = image_path_dict
57
+
58
+ self.image_folder = image_folder
59
+
60
+ self.tokenizer = BUILDER.build(tokenizer)
61
+ if special_tokens is not None:
62
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
63
+
64
+ self.template_map_fn = template_map_fn
65
+ if isinstance(self.template_map_fn, dict) and self.lazy:
66
+ _type = self.template_map_fn['type']
67
+ del self.template_map_fn['type']
68
+ self.template_map_fn = _type(**self.template_map_fn)
69
+
70
+ if extra_image_processor is not None:
71
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
72
+
73
+ self.repeats = repeats
74
+
75
+ self._system = ''
76
+
77
+ self.min_dynamic_patch = 1
78
+ self.max_dynamic_patch = 12
79
+ self.downsample_ratio = 0.5
80
+ self.image_size = 448
81
+ self.use_thumbnail = True
82
+ patch_size = 14
83
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
84
+
85
+ self.transformer = T.Compose([
86
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
87
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
88
+ T.ToTensor(),
89
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
90
+ ])
91
+
92
+ if special_tokens is not None:
93
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
94
+
95
+ self.single_image_mode = single_image_mode
96
+
97
+ def json_file_preprocess(self, image_folder, json_folder):
98
+
99
+ # list jsons
100
+ print("Processing GRAND json files !!!")
101
+ if os.path.exists(self.json_list_save_path):
102
+ with open(self.json_list_save_path, 'r') as f:
103
+ json_files = json.load(f)
104
+ else:
105
+ json_files = os.listdir(json_folder)
106
+ _json_files = []
107
+ for _file in json_files:
108
+ if '.json' in _file:
109
+ _json_files.append(os.path.join(json_folder, _file))
110
+ json_files = _json_files
111
+ with open(self.json_list_save_path, 'w') as f:
112
+ json.dump(json_files, f)
113
+ print(f"Finished, {len(json_files)} json files !")
114
+
115
+ # list images
116
+ print("Processing GRAND image files !!!")
117
+ if os.path.exists(self.image_list_save_path):
118
+ with open(self.image_list_save_path, 'r') as f:
119
+ image_path_dict = json.load(f)
120
+ else:
121
+ sub_folders = os.listdir(image_folder)
122
+ _sub_folders = []
123
+ for folder_name in sub_folders:
124
+ if 'sa_00' in folder_name:
125
+ _sub_folders.append(folder_name)
126
+ sub_folders = _sub_folders
127
+ sub_folders = [os.path.join(image_folder, folder_name) for folder_name in sub_folders]
128
+
129
+ image_path_dict = {}
130
+ for sub_folder in sub_folders:
131
+ files = os.listdir(sub_folder)
132
+ for _file in files:
133
+ if '.jpg' in _file:
134
+ image_path_dict[_file] = os.path.join(sub_folder, _file)
135
+
136
+ with open(self.image_list_save_path, 'w') as f:
137
+ json.dump(image_path_dict, f)
138
+ print(f"Finished, {len(image_path_dict)} image files !")
139
+
140
+ return json_files, image_path_dict
141
+
142
+ @property
143
+ def modality_length(self):
144
+ length_list = [10000] * len(self.json_data)
145
+ return length_list * self.repeats
146
+
147
+ def __len__(self):
148
+ return len(self.json_data) * self.repeats
149
+
150
+ def real_len(self):
151
+ return len(self.json_data)
152
+
153
+ def decode_mask(self, object_masks, ori_height, ori_width):
154
+ binary_masks = []
155
+ for object_mask in object_masks:
156
+ binary_mask = np.zeros((ori_height, ori_width), dtype=np.uint8)
157
+ for seg in object_mask:
158
+ m = mask.decode(seg)
159
+ m = m.astype(np.uint8)
160
+ binary_mask += m.squeeze()
161
+
162
+ binary_masks.append(binary_mask)
163
+ if len(binary_masks) == 0:
164
+ return None
165
+ masks = np.stack(binary_masks, axis=0)
166
+ masks = torch.from_numpy(masks)
167
+ return masks
168
+
169
+ def dataset_map_fn(self, data_dict):
170
+ data_dict = glamm_grand_map_fn(data_dict)
171
+ return data_dict
172
+
173
+ def replace_image_str(self, data_dict, image_str):
174
+ data_dict['conversation'][0]['input'] = \
175
+ data_dict['conversation'][0]['input'].replace(DEFAULT_IMAGE_TOKEN, image_str)
176
+ return data_dict
177
+
178
+ def __getitem__(self, index):
179
+
180
+ index = index % self.real_len()
181
+ json_file_path = self.json_data[index]
182
+ with open(json_file_path, 'r') as f:
183
+ json_dict = json.load(f)
184
+
185
+ image_name = list(json_dict.keys())[0]
186
+
187
+ if image_name not in self.image_path_dict.keys():
188
+ return self.__getitem__(random.randint(0, len(self.json_data) - 1))
189
+ image_path = self.image_path_dict[image_name]
190
+
191
+ json_dict = json_dict[image_name]
192
+ # parse datasets
193
+ result = self.dataset_map_fn(json_dict)
194
+ json_dict.update(result)
195
+ data_dict = json_dict
196
+
197
+ data_dict['image'] = image_path
198
+
199
+ # process image
200
+ image_file = data_dict['image']
201
+ try:
202
+ image = Image.open(os.path.join(self.image_folder,
203
+ image_file)).convert('RGB')
204
+ except Exception:
205
+ return self.__getitem__(random.randint(0, len(self.json_data) - 1))
206
+ ori_width, ori_height = image.size
207
+ if hasattr(self, 'extra_image_processor'):
208
+ g_image = np.array(image) # for grounding
209
+ g_image = self.extra_image_processor.apply_image(g_image)
210
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
211
+ data_dict['g_pixel_values'] = g_pixel_values
212
+
213
+ if self.single_image_mode:
214
+ images = [image]
215
+ else:
216
+ images = dynamic_preprocess(image, self.min_dynamic_patch,
217
+ self.max_dynamic_patch,
218
+ self.image_size, self.use_thumbnail)
219
+ pixel_values = [self.transformer(image) for image in images]
220
+ pixel_values = torch.stack(pixel_values)
221
+ data_dict['pixel_values'] = pixel_values
222
+
223
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
224
+ image_token_str = f'{self.IMG_START_TOKEN}' \
225
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
226
+ f'{self.IMG_END_TOKEN}'
227
+
228
+ data_dict = self.replace_image_str(data_dict, image_token_str)
229
+
230
+ result = self.template_map_fn(data_dict)
231
+ data_dict.update(result)
232
+ result = video_lisa_encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length,
233
+ with_image_token=True)
234
+ data_dict.update(result)
235
+ # process mask
236
+ data_dict['masks'] = self.decode_mask(data_dict['masks'], ori_height=ori_height, ori_width=ori_width)
237
+
238
+ if data_dict['masks'] is None:
239
+ return self.__getitem__(random.randint(0, len(self.json_data) - 1))
240
+
241
+ return data_dict
projects/llava_sam2/datasets/MeVIS_Dataset.py ADDED
@@ -0,0 +1,5 @@
1
+ from .ReVOS_Dataset import VideoReVOSDataset
2
+
3
+
4
+ class VideoMeVISDataset(VideoReVOSDataset):
5
+ pass
projects/llava_sam2/datasets/Osprey_Dataset.py ADDED
@@ -0,0 +1,463 @@
1
+ import json
2
+ import os
3
+
4
+ import torch
5
+ from datasets import Dataset as HFDataset
6
+ from datasets import DatasetDict, load_from_disk
7
+ from PIL import Image
8
+ from torch.utils.data import Dataset
9
+ from pycocotools import mask as maskUtils
10
+ import numpy as np
11
+ import copy
12
+
13
+ from xtuner.registry import BUILDER
14
+ from xtuner.dataset.huggingface import process_hf_dataset, build_origin_dataset
15
+ import torchvision.transforms as T
16
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN
17
+ from torchvision.transforms.functional import InterpolationMode
18
+ from .encode_fn import video_lisa_encode_fn
19
+ from .utils import dynamic_preprocess
20
+
21
+ import random
22
+
23
+ import torch.nn.functional as F
24
+
25
+ class OspreyDataset(Dataset):
26
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
27
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
28
+ IMG_START_TOKEN = '<img>'
29
+ IMG_END_TOKEN = '</img>'
30
+
31
+ LIMIT = ''
32
+
33
+ VP_START_TOKEN = '<vp>'
34
+ VP_END_TOKEN = '</vp>'
35
+
36
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
37
+ IMAGENET_STD = (0.229, 0.224, 0.225)
38
+ def __init__(self,
39
+ image_folder,
40
+ data_path=None,
41
+ tokenizer=None,
42
+ max_length=8196,
43
+ special_tokens=None,
44
+ template_map_fn=None,
45
+ extra_image_processor=None,
46
+ lazy=True,
47
+ repeats=1,
48
+ single_image_mode=False,
49
+ ):
50
+ super().__init__()
51
+ assert lazy
52
+ self.lazy = lazy
53
+ self.max_length = max_length
54
+
55
+ json_data = self.json_file_preprocess(data_path)
56
+ self.text_data = json_data
57
+
58
+ self.image_folder = image_folder
59
+
60
+ self.tokenizer = BUILDER.build(tokenizer)
61
+ if special_tokens is not None:
62
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
63
+
64
+ self.template_map_fn = template_map_fn
65
+ if isinstance(self.template_map_fn, dict) and self.lazy:
66
+ _type = self.template_map_fn['type']
67
+ del self.template_map_fn['type']
68
+ self.template_map_fn = _type(**self.template_map_fn)
69
+
70
+ if extra_image_processor is not None:
71
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
72
+
73
+ self.repeats = repeats
74
+
75
+ self._system = ''
76
+
77
+ self.min_dynamic_patch = 1
78
+ self.max_dynamic_patch = 12
79
+ self.downsample_ratio = 0.5
80
+ self.image_size = 448
81
+ self.use_thumbnail = True
82
+ patch_size = 14
83
+ self.patch_size = patch_size
84
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
85
+
86
+ self.transformer = T.Compose([
87
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
88
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
89
+ T.ToTensor(),
90
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
91
+ ])
92
+
93
+ if special_tokens is not None:
94
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
95
+
96
+ self.single_image_mode = single_image_mode
97
+
98
+ def json_file_preprocess(self, data_path):
99
+ with open(data_path, 'r') as f:
100
+ json_data = json.load(f)
101
+ return json_data
102
+
103
+ @property
104
+ def modality_length(self):
105
+ length_list = []
106
+ for data_dict in self.text_data:
107
+ if self.lazy:
108
+ cur_len = 100
109
+ else:
110
+ cur_len = len(data_dict['input_ids'])
111
+ if data_dict.get('image', None) is None:
112
+ cur_len = -cur_len
113
+ length_list.append(cur_len)
114
+ return length_list * self.repeats
115
+
116
+ def __len__(self):
117
+ return len(self.text_data) * self.repeats
118
+
119
+ def real_len(self):
120
+ return len(self.text_data)
121
+
122
+ def annToMask(self, mask_ann, h, w):
123
+ if isinstance(mask_ann, list):
124
+ rles = maskUtils.frPyObjects(mask_ann, h, w)
125
+ rle = maskUtils.merge(rles)
126
+ elif isinstance(mask_ann['counts'], list):
127
+ # uncompressed RLE
128
+ rle = maskUtils.frPyObjects(mask_ann, h, w)
129
+ else:
130
+ # rle
131
+ rle = mask_ann
132
+ mask = maskUtils.decode(rle)
133
+ return mask
134
+
135
+ def decode_mask(self, object_masks, ori_height, ori_width):
136
+ binary_masks = []
137
+ for object_mask in object_masks:
138
+ binary_mask = self.annToMask(object_mask, ori_height, ori_width)
139
+ binary_masks.append(binary_mask)
140
+ if len(binary_masks) == 0:
141
+ return None
142
+ masks = np.stack(binary_masks, axis=0)
143
+ masks = torch.from_numpy(masks)
144
+ return masks
145
+
146
+ def _process_conversation(self, converations, n_regions, region_pixels):
147
+ start_region_str = '<image> There are {} part regions in the picture: '.format(n_regions)
148
+ for i in range(n_regions):
149
+ start_region_str = start_region_str + \
150
+ f"region{i+1}" + self.VP_START_TOKEN + self.IMG_CONTEXT_TOKEN * region_pixels[i] + self.VP_END_TOKEN
151
+ if i == n_regions - 1:
152
+ start_region_str = start_region_str + '.\n'
153
+ else:
154
+ start_region_str = start_region_str + ', '
155
+
156
+ for i, item in enumerate(converations):
157
+ item['value'] = item['value'].replace('<', '').replace('>', '')
158
+ if item['from'] == 'human':
159
+ item['value'] = item['value'] + self.LIMIT
160
+ # first conv process
161
+ if i == 0:
162
+ assert item['from'] == "human"
163
+ item['value'] = start_region_str + item['value']
164
+
165
+ messages = converations
166
+ input = ''
167
+
168
+ conversation = []
169
+ while messages and messages[0]['from'] == 'gpt':
170
+ # Skip the first one if it is from gpt
171
+ messages = messages[1:]
172
+ for msg in messages:
173
+ if msg['from'] == 'human':
174
+ if DEFAULT_IMAGE_TOKEN in msg['value']:
175
+ msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN,
176
+ '').strip()
177
+ msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value']
178
+ msg['value'] = msg['value'].strip()
179
+ input += msg['value']
180
+
181
+ elif msg['from'] == 'gpt':
182
+ conversation.append({'input': input, 'output': msg['value']})
183
+ input = ''
184
+ else:
185
+ raise NotImplementedError
186
+
187
+ return conversation
188
+
189
+ def _get_region_infos(self, masks):
190
+ # masks tensor, (n_obj, h, w)
191
+ masks = F.interpolate(
192
+ masks.unsqueeze(0),
193
+ size=(int(self.image_size // self.patch_size * self.downsample_ratio),
194
+ int(self.image_size // self.patch_size * self.downsample_ratio)),
195
+ mode='nearest').squeeze(0)
196
+ region_pixels = []
197
+ for mask in masks:
198
+ region_pixels.append(mask.bool().to(torch.int64).sum())
199
+ return masks, region_pixels
200
+
201
+ def dataset_map_fn(self, data_dict):
202
+ file_name = data_dict['file_name'] # image file name
203
+ conversations = data_dict['conversations']
204
+ masks = [anno["segmentation"] for anno in data_dict["annotation"]]
205
+ height = data_dict['height']
206
+ width = data_dict['width']
207
+ _ret = {}
208
+
209
+ _ret['image'] = file_name
210
+ _ret['height'] = height
211
+ _ret['width'] = width
212
+
213
+ masks = self.decode_mask(masks, height, width)
214
+ if masks is None:  # bail out before _get_region_infos, which cannot handle a missing mask
215
+ return None
216
+
217
+ masks, region_pixels = self._get_region_infos(masks)
218
+
219
+ conversations = self._process_conversation(conversations, len(masks), region_pixels)
220
+ _ret['conversation'] = conversations
221
+ _ret['prompt_masks'] = masks
222
+ return _ret
223
+
224
+ def replace_image_str(self, data_dict, image_str):
225
+ data_dict['conversation'][0]['input'] = \
226
+ data_dict['conversation'][0]['input'].replace(DEFAULT_IMAGE_TOKEN, image_str)
227
+ return data_dict
228
+
229
+ def __getitem__(self, index):
230
+
231
+ index = index % self.real_len()
232
+ data_dict = copy.deepcopy(self.text_data[index])
233
+
234
+ # parse datasets
235
+ result = self.dataset_map_fn(data_dict) # {'image', 'height', 'width', 'conversation', 'masks'}
236
+ if result is None or result['prompt_masks'] is None:
237
+ return self.__getitem__(0)
238
+
239
+ data_dict = result
240
+
241
+ # process image
242
+ image_file = data_dict['image']
243
+ if isinstance(self.image_folder, list):
244
+ for image_folder in self.image_folder:
245
+ image_path = os.path.join(image_folder, image_file)
246
+ if os.path.exists(image_path):
247
+ image = Image.open(image_path).convert('RGB')
248
+ break
249
+ else:
250
+ image = Image.open(os.path.join(self.image_folder,
251
+ image_file)).convert('RGB')
252
+ ori_width, ori_height = image.size
253
+
254
+ if self.single_image_mode:
255
+ images = [image]
256
+ else:
257
+ images = dynamic_preprocess(image, self.min_dynamic_patch,
258
+ self.max_dynamic_patch,
259
+ self.image_size, self.use_thumbnail)
260
+ vp_overall_mask = torch.Tensor([False] * (len(images) - 1) + [True])
261
+ data_dict['vp_overall_mask'] = vp_overall_mask
262
+
263
+ pixel_values = [self.transformer(image) for image in images]
264
+ pixel_values = torch.stack(pixel_values)
265
+ data_dict['pixel_values'] = pixel_values
266
+
267
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
268
+ image_token_str = f'{self.IMG_START_TOKEN}' \
269
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
270
+ f'{self.IMG_END_TOKEN}'
271
+
272
+ data_dict = self.replace_image_str(data_dict, image_token_str)
273
+
274
+ result = self.template_map_fn(data_dict)
275
+ data_dict.update(result)
276
+ result = video_lisa_encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length,
277
+ with_image_token=True)
278
+ data_dict.update(result)
279
+ # process mask
280
+ # data_dict['prompt_masks'] = data_dict['prompt_masks']
281
+
282
+ if data_dict['prompt_masks'] is None:
283
+ return self.__getitem__(0)
284
+
285
+ return data_dict
286
+
287
+
288
+ DETAILED_QUESTIONS = [
289
+ 'Can you provide me with a detailed description of the region in the picture marked by <region>?',
290
+ "I'm curious about the region represented by <region> in the picture. Could you describe it in detail?",
291
+ 'What can you tell me about the region indicated by <region> in the image?',
292
+ "I'd like to know more about the area in the photo labeled <region>. Can you give me a detailed description?",
293
+ 'Could you describe the region shown as <region> in the picture in great detail?',
294
+ 'What details can you give me about the region outlined by <region> in the photo?',
295
+ 'Please provide me with a comprehensive description of the region marked with <region> in the image.',
296
+ 'Can you give me a detailed account of the region labeled as <region> in the picture?',
297
+ "I'm interested in learning more about the region represented by <region> in the photo. Can you describe it in detail?",
298
+ 'What is the region outlined by <region> in the picture like? Could you give me a detailed description?',
299
+ 'Can you provide me with a detailed description of the region in the picture marked by <region>, please?',
300
+ "I'm curious about the region represented by <region> in the picture. Could you describe it in detail, please?",
301
+ 'What can you tell me about the region indicated by <region> in the image, exactly?',
302
+ "I'd like to know more about the area in the photo labeled <region>, please. Can you give me a detailed description?",
303
+ 'Could you describe the region shown as <region> in the picture in great detail, please?',
304
+ 'What details can you give me about the region outlined by <region> in the photo, please?',
305
+ 'Please provide me with a comprehensive description of the region marked with <region> in the image, please.',
306
+ 'Can you give me a detailed account of the region labeled as <region> in the picture, please?',
307
+ "I'm interested in learning more about the region represented by <region> in the photo. Can you describe it in detail, please?",
308
+ 'What is the region outlined by <region> in the picture like, please? Could you give me a detailed description?',
309
+ 'Please describe the region <region> in the image in detail.',
310
+ 'Can you offer a thorough analysis of the region <region> in the image?',
311
+ 'Could you elaborate on the region highlighted by <region> in the picture provided?',
312
+ 'Please share more information about the zone emphasized with <region> in the photo.',
313
+ 'What insights can you give about the area denoted by <region> in the image presented?',
314
+ 'Can you share a comprehensive rundown of the region denoted by <region> in the presented image?',
315
+ "I'd like to know more about the region highlighted by <region> in the picture provided.",
316
+ 'Work through the important details of the area <region> in the image.',
317
+ 'Illustrate the area represented by <region> through a descriptive explanation.',
318
+ 'Examine the region <region> closely and share its details.'
319
+ ]
320
+
321
+ class OspreyDescriptionDataset(OspreyDataset):
322
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
323
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
324
+ IMG_START_TOKEN = '<img>'
325
+ IMG_END_TOKEN = '</img>'
326
+
327
+ VP_START_TOKEN = '<vp>'
328
+ VP_END_TOKEN = '</vp>'
329
+
330
+ LIMIT=''
331
+
332
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
333
+ IMAGENET_STD = (0.229, 0.224, 0.225)
334
+ def __init__(self,
335
+ image_folder,
336
+ data_path=None,
337
+ tokenizer=None,
338
+ max_length=8196,
339
+ special_tokens=None,
340
+ template_map_fn=None,
341
+ extra_image_processor=None,
342
+ lazy=True,
343
+ repeats=1,
344
+ single_image_mode=False,
345
+ ):
346
+ super(OspreyDescriptionDataset, self).__init__(
347
+ image_folder=image_folder,
348
+ data_path=data_path,
349
+ tokenizer=tokenizer,
350
+ max_length=max_length,
351
+ special_tokens=special_tokens,
352
+ template_map_fn=template_map_fn,
353
+ extra_image_processor=extra_image_processor,
354
+ lazy=lazy,
355
+ repeats=repeats,
356
+ single_image_mode=single_image_mode,
357
+ )
358
+
359
+ def dataset_map_fn(self, data_dict):
360
+ file_name = data_dict['file_name'] # image file name
361
+ descriptions = data_dict['description']
362
+ masks = [anno["segmentation"] for anno in data_dict["annotation"]]
363
+ height = data_dict['height']
364
+ width = data_dict['width']
365
+ _ret = {}
366
+
367
+ _ret['image'] = file_name
368
+ _ret['height'] = height
369
+ _ret['width'] = width
370
+
371
+ masks = self.decode_mask(masks, height, width)
372
+ if masks is None:  # bail out before _get_region_infos, which cannot handle a missing mask
373
+ return None
374
+
375
+ masks, region_pixels = self._get_region_infos(masks)
376
+
377
+ conversations = self._process_conversation(descriptions, len(masks), region_pixels)
378
+ _ret['conversation'] = conversations
379
+ _ret['prompt_masks'] = masks
380
+ return _ret
381
+
382
+ def _process_conversation(self, descriptions, n_regions, region_pixels):
383
+ start_region_str = '<image> There are {} part regions in the picture: '.format(n_regions)
384
+ for i in range(n_regions):
385
+ start_region_str = start_region_str + \
386
+ f"region{i+1}" + self.VP_START_TOKEN + self.IMG_CONTEXT_TOKEN * region_pixels[i] + self.VP_END_TOKEN
387
+ if i == n_regions - 1:
388
+ start_region_str = start_region_str + '.\n'
389
+ else:
390
+ start_region_str = start_region_str + ', '
391
+
392
+ converations = []
393
+ for i, item in enumerate(descriptions):
394
+ question = random.choice(DETAILED_QUESTIONS).strip().replace('<region>', f"region{i+1}") + self.LIMIT
395
+ answer = item.replace('<', '').replace('>', '')
396
+ # first conv process
397
+ if i == 0:
398
+ question = start_region_str + question
399
+ converations.append({'from': 'human', 'value': question})
400
+ converations.append({'from': 'gpt', 'value': answer})
401
+
402
+ messages = converations
403
+ input = ''
404
+
405
+ conversation = []
406
+ while messages and messages[0]['from'] == 'gpt':
407
+ # Skip the first one if it is from gpt
408
+ messages = messages[1:]
409
+ for msg in messages:
410
+ if msg['from'] == 'human':
411
+ if DEFAULT_IMAGE_TOKEN in msg['value']:
412
+ msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN,
413
+ '').strip()
414
+ msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value']
415
+ msg['value'] = msg['value'].strip()
416
+ input += msg['value']
417
+
418
+ elif msg['from'] == 'gpt':
419
+ conversation.append({'input': input, 'output': msg['value']})
420
+ input = ''
421
+ else:
422
+ raise NotImplementedError
423
+ return conversation
424
+
425
+
426
+ class OspreyShortDescriptionDataset(OspreyDataset):
427
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
428
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
429
+ IMG_START_TOKEN = '<img>'
430
+ IMG_END_TOKEN = '</img>'
431
+
432
+ VP_START_TOKEN = '<vp>'
433
+ VP_END_TOKEN = '</vp>'
434
+
435
+ LIMIT = ' Answer the question using a single word or phrase.'
436
+
437
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
438
+ IMAGENET_STD = (0.229, 0.224, 0.225)
439
+
440
+ def __init__(self,
441
+ image_folder,
442
+ data_path=None,
443
+ tokenizer=None,
444
+ max_length=8196,
445
+ special_tokens=None,
446
+ template_map_fn=None,
447
+ extra_image_processor=None,
448
+ lazy=True,
449
+ repeats=1,
450
+ single_image_mode=False,
451
+ ):
452
+ super(OspreyShortDescriptionDataset, self).__init__(
453
+ image_folder=image_folder,
454
+ data_path=data_path,
455
+ tokenizer=tokenizer,
456
+ max_length=max_length,
457
+ special_tokens=special_tokens,
458
+ template_map_fn=template_map_fn,
459
+ extra_image_processor=extra_image_processor,
460
+ lazy=lazy,
461
+ repeats=repeats,
462
+ single_image_mode=single_image_mode,
463
+ )
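Note: a rough sketch of how OspreyDataset sizes each <vp> ... </vp> span. Prompt masks are resized to the 16x16 vision-feature grid (448 / 14 * 0.5) by _get_region_infos, and every occupied grid cell contributes one <IMG_CONTEXT> token; the toy masks below are made up for illustration.

# Sketch only; follows OspreyDataset._get_region_infos.
import torch
import torch.nn.functional as F

masks = torch.zeros(2, 480, 640)   # two toy regions at the original image resolution
masks[0, :240, :320] = 1           # region 1: top-left quarter of the image
masks[1, :30, :40] = 1             # region 2: a tiny patch in the corner
grid = F.interpolate(masks.unsqueeze(0), size=(16, 16), mode='nearest').squeeze(0)
region_pixels = [int(m.bool().sum()) for m in grid]
print(region_pixels)               # [64, 1] -> tokens reserved per region in the prompt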
projects/llava_sam2/datasets/ReSAM2_Dataset.py ADDED
@@ -0,0 +1,489 @@
1
+ import logging
2
+ import os
3
+ import torch
4
+ from datasets import Dataset as HFDataset
5
+ from datasets import DatasetDict, load_from_disk
6
+ from mmengine import print_log
7
+ from PIL import Image
8
+ from torch.utils.data import Dataset
9
+ import numpy as np
10
+
11
+ from xtuner.registry import BUILDER
12
+ from xtuner.dataset.huggingface import process_hf_dataset, build_origin_dataset
13
+ import copy
14
+ from .encode_fn import video_lisa_encode_fn
15
+ import json
16
+ import random
17
+ import pycocotools.mask as maskUtils
18
+ import cv2
19
+ import torchvision.transforms as T
20
+ from torchvision.transforms.functional import InterpolationMode
21
+
22
+ SEG_QUESTIONS = [
23
+ "Please segment the object according to the description: {class_name}",
24
+ ]
25
+
26
+ SEG_QUESTIONS_SHORT = [
27
+ "Can you segment the {class_name} in this image?",
28
+ "Please segment {class_name} in this image.",
29
+ "What is {class_name} in this image? Please respond with segmentation mask.",
30
+ "What is {class_name} in this image? Please output segmentation mask.",
31
+
32
+ "Can you segment the {class_name} in this image",
33
+ "Please segment {class_name} in this image",
34
+ "What is {class_name} in this image? Please respond with segmentation mask",
35
+ "What is {class_name} in this image? Please output segmentation mask",
36
+
37
+ "Could you provide a segmentation mask for the {class_name} in this image?",
38
+ "Please identify and segment the {class_name} in this image.",
39
+ "Where is the {class_name} in this picture? Please respond with a segmentation mask.",
40
+ "Can you highlight the {class_name} in this image with a segmentation mask?",
41
+
42
+ "Could you provide a segmentation mask for the {class_name} in this image",
43
+ "Please identify and segment the {class_name} in this image",
44
+ "Where is the {class_name} in this picture? Please respond with a segmentation mask",
45
+ "Can you highlight the {class_name} in this image with a segmentation mask",
46
+ ]
47
+
48
+ ANSWER_LIST = [
49
+ "It is [SEG].",
50
+ "Sure, [SEG].",
51
+ "Sure, it is [SEG].",
52
+ "Sure, the segmentation result is [SEG].",
53
+ "[SEG].",
54
+ ]
55
+
56
+ class VideoSAM2Dataset(Dataset):
57
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
58
+ IMAGENET_STD = (0.229, 0.224, 0.225)
59
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
60
+ IMG_START_TOKEN = '<img>'
61
+ IMG_END_TOKEN = '</img>'
62
+
63
+ FAST_IMG_CONTEXT_TOKEN = '<FAST_IMG_CONTEXT>'
64
+ FAST_IMG_START_TOKEN = '<fast_img>'
65
+ FAST_IMG_END_TOKEN = '</fast_img>'
66
+
67
+ def __init__(self,
68
+ sam2_folder,
69
+ expression_file,
70
+ extra_image_processor=None,
71
+ tokenizer=None,
72
+ select_number=5,
73
+ sampled_frames=5,
74
+ offline_processed_text_folder=None,
75
+ template_map_fn=None,
76
+ max_length=8196,
77
+ lazy=True,
78
+ repeats=1,
79
+ special_tokens=None,
80
+ use_fast=False,
81
+ n_fast_images=50,
82
+ fast_pool_size=4,
83
+ mode='long',
84
+ frame_contiguous_sample=False,
85
+ ):
86
+ assert mode in ['long', 'long_short', 'short']
87
+ self.mode = mode
88
+ self.cur_mode = mode
89
+ assert lazy is True
90
+ self.tokenizer = BUILDER.build(tokenizer)
91
+ self.select_number = select_number
92
+ self.sampled_frames = sampled_frames
93
+ assert offline_processed_text_folder or (expression_file and tokenizer)
94
+ self.lazy = lazy
95
+
96
+ self.max_length = max_length
97
+
98
+ self.template_map_fn = template_map_fn
99
+ if isinstance(self.template_map_fn, dict) and self.lazy:
100
+ _type = self.template_map_fn['type']
101
+ del self.template_map_fn['type']
102
+ self.template_map_fn = _type(**self.template_map_fn)
103
+
104
+ if offline_processed_text_folder and expression_file:
105
+ print_log(
106
+ 'Both `offline_processed_text_folder` and '
107
+ '`data_path` are set, and we load dataset from'
108
+ '`offline_processed_text_folder` '
109
+ f'({offline_processed_text_folder})',
110
+ logger='current',
111
+ level=logging.WARNING)
112
+
113
+ if offline_processed_text_folder is not None:
114
+ raise NotImplementedError
115
+ else:
116
+ video_ids, anno_dict = self.json_file_preprocess(expression_file)
117
+ if self.lazy:
118
+ self.video_ids = video_ids
119
+ self.anno_dict = anno_dict
120
+ else:
121
+ raise NotImplementedError
122
+
123
+ self.sam2_folder = sam2_folder
124
+ if extra_image_processor is not None:
125
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
126
+ self.down_ratio = 1
127
+ self.repeats = repeats
128
+
129
+ self._system = ''
130
+
131
+ self.downsample_ratio = 0.5
132
+ self.image_size = 448
133
+ patch_size = 14
134
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
135
+
136
+ self.transformer = T.Compose([
137
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
138
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
139
+ T.ToTensor(),
140
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
141
+ ])
142
+
143
+ if special_tokens is not None:
144
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
145
+
146
+ self.use_fast = use_fast
147
+ self.n_fast_images = n_fast_images
148
+ self.fast_pool_size = fast_pool_size
149
+
150
+ self.frame_contiguous_sample = frame_contiguous_sample
151
+
152
+ # for visualization debug
153
+ self.save_folder = './work_dirs/video_debug/'
154
+ self.cur_number = 0
155
+
156
+ print("Video res dataset (ref-sam2), include {} items.".format(len(self.video_ids)))
157
+
158
+ def __len__(self):
159
+ return len(self.video_ids) * self.repeats
160
+
161
+ @property
162
+ def modality_length(self):
163
+ length_list = []
164
+ for data_dict in self.video_ids:
165
+ cur_len = 20000
166
+ length_list.append(cur_len)
167
+ return length_list
168
+
169
+ def real_len(self):
170
+ return len(self.video_ids)
171
+
172
+ def json_file_preprocess(self, expression_file):
173
+ # prepare expression annotation files
174
+ with open(expression_file, 'r') as f:
175
+ expression_datas = json.load(f)
176
+
177
+ video_ids = list(expression_datas.keys())
178
+ return video_ids, expression_datas
179
+
180
+ def dataset_map_fn(self, objects_expression_infos, n_frames, n_fast_frames=0):
181
+ # prepare text
182
+ if self.mode == 'long':
183
+ expressions = [object_info['formated'] for object_info in objects_expression_infos]
184
+ self.cur_mode = self.mode
185
+ elif self.mode == 'short':
186
+ expressions = [object_info['short_caps'][random.randint(0, len(object_info['short_caps'])-1)] for object_info in objects_expression_infos]
187
+ self.cur_mode = self.mode
188
+ else:
189
+ if random.random() < 0.5:
190
+ expressions = [object_info['formated'] for object_info in objects_expression_infos]
191
+ self.cur_mode = 'long'
192
+ else:
193
+ expressions = [object_info['short_caps'][random.randint(0, len(object_info['short_caps']) - 1)] for
194
+ object_info in objects_expression_infos]
195
+ self.cur_mode = 'short'
196
+ text_dict = self.prepare_text(n_frames, expressions, num_image_tokens=self.patch_token,
197
+ n_fast_frames=n_fast_frames)
198
+ ret = {'conversation': text_dict['conversation']}
199
+ return ret
200
+
201
+ def prepare_text(self, n_frames, expressions, num_image_tokens=256, n_fast_frames=0):
202
+
203
+ if self.use_fast:
204
+ fast_frame_token_str = f'{self.FAST_IMG_START_TOKEN}' \
205
+ f'{self.FAST_IMG_CONTEXT_TOKEN * n_fast_frames * self.fast_pool_size * self.fast_pool_size}' \
206
+ f'{self.FAST_IMG_END_TOKEN}' + '\n'
207
+ else:
208
+ fast_frame_token_str = ''
209
+
210
+ frame_token_str = f'{self.IMG_START_TOKEN}' \
211
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
212
+ f'{self.IMG_END_TOKEN}'
213
+
214
+ questions = []
215
+ answers = []
216
+ for i, exp in enumerate(expressions):
217
+ if self.cur_mode == 'short':
218
+ question_template = random.choice(SEG_QUESTIONS_SHORT)
219
+ exp = exp.replace("A ", '')
220
+ else:
221
+ question_template = random.choice(SEG_QUESTIONS)
222
+ questions.append(question_template.format(class_name=exp))
223
+ answers.append(random.choice(ANSWER_LIST))
224
+ qa_list = []
225
+ for i, (question, answer) in enumerate(zip(questions, answers)):
226
+ if i == 0:
227
+ frame_tokens = frame_token_str + '\n'
228
+ # frame_tokens = '=' + ' '
229
+ frame_tokens = frame_tokens * n_frames
230
+ frame_tokens = frame_tokens.strip()
231
+ frame_tokens = fast_frame_token_str + frame_tokens
232
+ qa_list.append(
233
+ {'from': 'human', 'value': frame_tokens + question}
234
+ )
235
+ else:
236
+ qa_list.append(
237
+ {'from': 'human', 'value': question}
238
+ )
239
+ qa_list.append(
240
+ {'from': 'gpt', 'value': answer}
241
+ )
242
+
243
+ input = ''
244
+ conversation = []
245
+ for msg in qa_list:
246
+ if msg['from'] == 'human':
247
+ input += msg['value']
248
+ elif msg['from'] == 'gpt':
249
+ conversation.append({'input': input, 'output': msg['value']})
250
+ input = ''
251
+ else:
252
+ raise NotImplementedError
253
+
254
+ # add system information
255
+ conversation[0].update({'system': self._system})
256
+ return {'conversation': conversation}
257
+
258
+ def __getitem__(self, index):
259
+ index = index % self.real_len()
260
+ video_id = self.video_ids[index]
261
+ expression_dict = self.anno_dict[video_id]
262
+ object_ids = list(expression_dict['objects'].keys())
263
+
264
+ video_path = os.path.join(self.sam2_folder, expression_dict['video_path'])
265
+ anno_path = os.path.join(self.sam2_folder, expression_dict['anno_path'])
266
+
267
+ video_frames = get_video_frames(video_path)
268
+
269
+ if self.use_fast:
270
+ # sample fast branch
271
+ fast_interval = len(video_frames) / (self.n_fast_images + 1e-4)
272
+ sampled_fast_frame_idxs = [min(int(i * fast_interval), len(video_frames) - 1) for i in range(self.n_fast_images)]
273
+ fast_video_frames = [video_frames[_idx] for _idx in sampled_fast_frame_idxs]
274
+ else:
275
+ fast_video_frames = None
276
+
277
+ video_frames = video_frames[::4]
278
+
279
+ # mask annotation
280
+ with open(anno_path, 'r') as f:
281
+ mask_data = json.load(f)
282
+ masklents = decode_masklet(mask_data['masklet'])
283
+
284
+ n_frames = len(masklents)
285
+ n_objects = len(object_ids)
286
+
287
+ # sample object
288
+ if n_objects > self.select_number:
289
+ selected_indexes = np.random.choice(n_objects, self.select_number, replace=False)
290
+ else:
291
+ selected_indexes = np.random.choice(n_objects, self.select_number, replace=True)
292
+
293
+ selected_object_ids = [object_ids[_idx] for _idx in selected_indexes]
294
+ objects_expression_infos = [expression_dict['objects'][_idx] for _idx in selected_object_ids]
295
+ _masklents = []
296
+ for _mask in masklents:
297
+ _mask_selected = []
298
+ for _idx in selected_object_ids:
299
+ _mask_selected.append(_mask[:, :, int(_idx)])
300
+ _mask_selected = np.stack(_mask_selected, axis=2)
301
+ _masklents.append(_mask_selected)
302
+ masklents = _masklents
303
+
304
+ # sample video frames
305
+ # prepare images, random select k frames
306
+ if n_frames > self.sampled_frames + 1:
307
+ if self.frame_contiguous_sample and random.random() < 0.5:
308
+ # do contiguous sample
309
+ selected_start_frame = np.random.choice(n_frames - self.sampled_frames, 1, replace=False)
310
+ selected_frame_indexes = [selected_start_frame[0] + _i for _i in range(self.sampled_frames)]
311
+ else:
312
+ selected_frame_indexes = np.random.choice(n_frames, self.sampled_frames, replace=False)
313
+ else:
314
+ selected_frame_indexes = np.random.choice(n_frames, self.sampled_frames, replace=True)
315
+ selected_frame_indexes.sort()
316
+
317
+ video_frames = [video_frames[_idx] for _idx in selected_frame_indexes]
318
+ masklents = [masklents[_idx] for _idx in selected_frame_indexes]
319
+
320
+ data_dict = self.dataset_map_fn(objects_expression_infos, len(video_frames), n_fast_frames=self.n_fast_images)
321
+ result = self.template_map_fn(data_dict)
322
+ data_dict.update(result)
323
+ result = video_lisa_encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length, with_image_token=True)
324
+ data_dict.update(result)
325
+
326
+ pixel_values = []
327
+ extra_pixel_values = []
328
+ for frame in video_frames:
329
+ frame = frame[:, :, ::-1]
330
+ frame_image = Image.fromarray(frame).convert('RGB')
331
+ ori_width, ori_height = frame_image.size
332
+ if hasattr(self, 'extra_image_processor'):  # set in __init__ only when an extra image processor is configured
333
+ g_image = np.array(frame_image) # for grounding
334
+ g_image = self.extra_image_processor.apply_image(g_image)
335
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
336
+ extra_pixel_values.append(g_pixel_values)
337
+
338
+ frame_image = self.transformer(frame_image)
339
+ pixel_values.append(frame_image)
340
+
341
+ pixel_values = torch.stack(pixel_values, dim=0) # (n_f, 3, h, w)
342
+ data_dict['pixel_values'] = pixel_values
343
+ if hasattr(self, 'extra_image_processor'):
344
+ data_dict['g_pixel_values'] = extra_pixel_values
345
+
346
+ # for fast branch
347
+ if self.use_fast:
348
+ fast_pixel_values = []
349
+ for frame_image in fast_video_frames:
350
+ frame = frame_image[:, :, ::-1]
351
+ frame_image = Image.fromarray(frame).convert('RGB')
352
+ ori_width, ori_height = frame_image.size
353
+
354
+ frame_image = self.transformer(frame_image)
355
+ fast_pixel_values.append(frame_image)
356
+
357
+ fast_pixel_values = torch.stack(fast_pixel_values, dim=0) # (n_f, 3, h, w)
358
+ data_dict['fast_pixel_values'] = fast_pixel_values
359
+
360
+ # process and get masks
361
+ masklents = np.stack(masklents, axis=0) # (n_frames, h, w, n_obj)
362
+ masklents = torch.from_numpy(masklents).permute(3, 0, 1, 2)
363
+ masklents = masklents.flatten(0, 1)
364
+ # print('sam2-mask_shape:', masklents.shape)
365
+ # print('sam2-pixel_values:', data_dict['pixel_values'].shape)
366
+ # print('sam2-g_pixel_values:', len(data_dict['g_pixel_values']), ', ', data_dict['g_pixel_values'][0].shape)
367
+ data_dict['masks'] = masklents
368
+ data_dict['type'] = 'video'
369
+ return data_dict
370
+
371
+ def visualization_debug(self, data_dict):
372
+ save_folder = os.path.join(self.save_folder, 'sample_{}'.format(self.cur_number))
373
+ if not os.path.exists(save_folder):
374
+ os.mkdir(save_folder)
375
+ self.cur_number += 1
376
+
377
+ # images
378
+
379
+ show_images = []
380
+
381
+ pixel_values = data_dict['pixel_values']
382
+ save_folder_image = os.path.join(save_folder, 'image')
383
+ if not os.path.exists(save_folder_image):
384
+ os.mkdir(save_folder_image)
385
+ for i_image, image_pixel_value in enumerate(pixel_values):
386
+ # print(image_pixel_value.shape)
387
+ image_pixel_value[0] = image_pixel_value[0] * 0.229  # un-normalize with the ImageNet std/mean used by self.transformer
388
+ image_pixel_value[1] = image_pixel_value[1] * 0.224
389
+ image_pixel_value[2] = image_pixel_value[2] * 0.225
390
+ image_pixel_value[0] = image_pixel_value[0] + 0.485
391
+ image_pixel_value[1] = image_pixel_value[1] + 0.456
392
+ image_pixel_value[2] = image_pixel_value[2] + 0.406
393
+ image_pixel_value = image_pixel_value * 255
394
+ image_pixel_value = image_pixel_value.permute(1, 2, 0)
395
+ image_pixel_value = image_pixel_value.to(torch.uint8).numpy()
396
+ # print(os.path.join(save_folder_image, '{}.jpg'.format(i_image)))
397
+ # print(image_pixel_value.shape)
398
+ show_images.append(image_pixel_value)
399
+ cv2.imwrite(os.path.join(save_folder_image, '{}.jpg'.format(i_image)), image_pixel_value)
400
+
401
+ # text
402
+ input_text = self.tokenizer.decode(data_dict['input_ids'], skip_special_tokens=False)
403
+ with open(os.path.join(save_folder, 'text.json'), 'w') as f:
404
+ json.dump([input_text], f)
405
+
406
+ # masks
407
+ save_folder_mask = os.path.join(save_folder, 'mask')
408
+ if not os.path.exists(save_folder_mask):
409
+ os.mkdir(save_folder_mask)
410
+ n_frames = len(pixel_values)
411
+ masks = data_dict['masks']
412
+ _, h, w = masks.shape
413
+ masks = masks.reshape(-1, n_frames, h, w)
414
+ for i_obj, obj_masks in enumerate(masks):
415
+ save_folder_mask_obj_folder = os.path.join(save_folder_mask, 'obj_{}'.format(i_obj))
416
+ if not os.path.exists(save_folder_mask_obj_folder):
417
+ os.mkdir(save_folder_mask_obj_folder)
418
+ for i_frame, f_mask in enumerate(obj_masks):
419
+ f_mask = f_mask.numpy()
420
+ f_mask = f_mask * 255
421
+ f_mask = np.stack([f_mask * 1, f_mask * 0, f_mask * 0], axis=2)
422
+ f_mask = show_images[i_frame] * 0.3 + 0.7 * f_mask
423
+ f_mask = f_mask.astype(np.uint8)
424
+ cv2.imwrite(os.path.join(save_folder_mask_obj_folder, '{}.png'.format(i_frame)), f_mask)
425
+ return
426
+
427
+ def get_video_frames(video_path):
428
+ cap = cv2.VideoCapture(video_path)
429
+
430
+ if not cap.isOpened():
431
+ print("Error: Cannot open video file.")
432
+ return []  # return an empty list so callers get a consistent type
433
+
434
+ frames = []
435
+
436
+ frame_id = 0
437
+ while True:
438
+ ret, frame = cap.read()
439
+
440
+ if not ret:
441
+ break
442
+
443
+ frames.append(frame)
444
+
445
+ frame_id += 1
446
+
447
+ cap.release()
448
+ return frames
449
+
450
+
451
+ def images_to_video(frames, video_name, fps=6):
452
+ height, width, layers = frames[0].shape
453
+
454
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
455
+ video = cv2.VideoWriter(video_name, fourcc, fps, (width, height))
456
+
457
+ for frame in frames:
458
+ video.write(frame)
459
+
460
+ # cv2.destroyAllWindows()
461
+ video.release()
462
+ return
463
+
464
+ def decode_masklet(masklet):
465
+ masks = []
466
+ for _rle in masklet:
467
+ mask = maskUtils.decode(_rle)
468
+ masks.append(mask)
469
+ return masks
470
+
471
+ def draw_mask(image, mask):
472
+ obj_mask = mask * 255
473
+ obj_mask = np.stack([obj_mask * 1, obj_mask * 0, obj_mask * 0], axis=2)
474
+ obj_mask = obj_mask * 0.5 + copy.deepcopy(image) * 0.5
475
+ obj_mask = obj_mask.astype(np.uint8)
476
+ return obj_mask
477
+
478
+ def add_mask2images(frames, masklets):
479
+ show_videos = []
480
+ for i_frames, (frame, masks) in enumerate(zip(frames, masklets)):
481
+ if i_frames == 0:
482
+ n_obj = masks.shape[-1]
483
+ for i_obj in range(n_obj):
484
+ show_videos.append([])
485
+
486
+ n_obj = masks.shape[-1]
487
+ for i_obj in range(n_obj):
488
+ show_videos[i_obj].append(draw_mask(copy.deepcopy(frame), masks[:, :, i_obj]))
489
+ return show_videos
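
The helper functions above (get_video_frames, decode_masklet, add_mask2images, images_to_video) can be chained to overlay SAM-2 masklets on a source clip for quick inspection. A minimal sketch, assuming a local video file and a `masklet` list of per-frame RLE lists; both names are placeholders, not files shipped with this repo:

    # Visualization sketch only; 'video.mp4' and `masklet` are hypothetical inputs.
    frames = get_video_frames('video.mp4')        # list of BGR frames read with OpenCV
    masks_per_frame = decode_masklet(masklet)     # each item: (H, W, n_obj) uint8 array
    per_object_clips = add_mask2images(frames, masks_per_frame)
    for i_obj, obj_frames in enumerate(per_object_clips):
        images_to_video(obj_frames, 'obj_{}.mp4'.format(i_obj), fps=6)
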
projects/llava_sam2/datasets/ReVOS_Dataset.py ADDED
@@ -0,0 +1,602 @@
1
+ import logging
2
+ import os
3
+ from typing import Literal
4
+
5
+ import torch
6
+ from datasets import Dataset as HFDataset
7
+ from datasets import DatasetDict
8
+ from mmengine import print_log
9
+ from PIL import Image
10
+ from torch.utils.data import Dataset
11
+ import numpy as np
12
+
13
+ from xtuner.registry import BUILDER
14
+ from xtuner.dataset.huggingface import build_origin_dataset
15
+ import copy
16
+
17
+ from .encode_fn import video_lisa_encode_fn
18
+ import json
19
+ import random
20
+ import pycocotools.mask as maskUtils
21
+ import cv2
22
+ import torchvision.transforms as T
23
+ from torchvision.transforms.functional import InterpolationMode
24
+
25
+ SEG_QUESTIONS = [
26
+ "Can you segment the {class_name} in this image?",
27
+ "Please segment {class_name} in this image.",
28
+ "What is {class_name} in this image? Please respond with segmentation mask.",
29
+ "What is {class_name} in this image? Please output segmentation mask.",
30
+
31
+ "Can you segment the {class_name} in this image",
32
+ "Please segment {class_name} in this image",
33
+ "What is {class_name} in this image? Please respond with segmentation mask",
34
+ "What is {class_name} in this image? Please output segmentation mask",
35
+
36
+ "Could you provide a segmentation mask for the {class_name} in this image?",
37
+ "Please identify and segment the {class_name} in this image.",
38
+ "Where is the {class_name} in this picture? Please respond with a segmentation mask.",
39
+ "Can you highlight the {class_name} in this image with a segmentation mask?",
40
+
41
+ "Could you provide a segmentation mask for the {class_name} in this image",
42
+ "Please identify and segment the {class_name} in this image",
43
+ "Where is the {class_name} in this picture? Please respond with a segmentation mask",
44
+ "Can you highlight the {class_name} in this image with a segmentation mask",
45
+ ]
46
+
47
+ ANSWER_LIST = [
48
+ "It is [SEG].",
49
+ "Sure, [SEG].",
50
+ "Sure, it is [SEG].",
51
+ "Sure, the segmentation result is [SEG].",
52
+ "[SEG].",
53
+ ]
54
+
55
+ class VideoReVOSDataset(Dataset):
56
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
57
+ IMAGENET_STD = (0.229, 0.224, 0.225)
58
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
59
+ IMG_START_TOKEN = '<img>'
60
+ IMG_END_TOKEN = '</img>'
61
+
62
+ FAST_IMG_CONTEXT_TOKEN = '<FAST_IMG_CONTEXT>'
63
+ FAST_IMG_START_TOKEN = '<fast_img>'
64
+ FAST_IMG_END_TOKEN = '</fast_img>'
65
+
66
+ def __init__(self,
67
+ image_folder,
68
+ expression_file,
69
+ mask_file,
70
+ extra_image_processor=None,
71
+ tokenizer=None,
72
+ select_number=5,
73
+ sampled_frames=10,
74
+ offline_processed_text_folder=None,
75
+ template_map_fn=None,
76
+ max_length=2048,
77
+ lazy=True,
78
+ repeats=1,
79
+ special_tokens=None,
80
+ frame_contiguous_sample=False,
81
+ use_fast=False,
82
+ arch_type: Literal['intern_vl', 'qwen'] = 'intern_vl',
83
+ preprocessor=None,
84
+ # only work if use_fast = True
85
+ n_fast_images=50,
86
+ fast_pool_size=4,
87
+ fast_token_after_question=False,
88
+ ):
89
+ assert lazy is True
90
+ self.tokenizer = BUILDER.build(tokenizer)
91
+ self.select_number = select_number
92
+ self.sampled_frames = sampled_frames
93
+ assert offline_processed_text_folder or (expression_file and tokenizer)
94
+ self.lazy = lazy
95
+
96
+ self.max_length = max_length
97
+
98
+ self.template_map_fn = template_map_fn
99
+ if isinstance(self.template_map_fn, dict) and self.lazy:
100
+ _type = self.template_map_fn['type']
101
+ del self.template_map_fn['type']
102
+ self.template_map_fn = _type(**self.template_map_fn)
103
+
104
+ if offline_processed_text_folder and expression_file:
105
+ print_log(
106
+ 'Both `offline_processed_text_folder` and '
107
+ '`expression_file` are set, and we load dataset from '
108
+ '`offline_processed_text_folder` '
109
+ f'({offline_processed_text_folder})',
110
+ logger='current',
111
+ level=logging.WARNING)
112
+
113
+ self.arch_type = arch_type
114
+ if self.arch_type == 'qwen':
115
+ self.IMG_CONTEXT_TOKEN = '<|image_pad|>'
116
+ self.IMG_START_TOKEN = '<|vision_start|>'
117
+ self.IMG_END_TOKEN = '<|vision_end|>'
118
+ elif self.arch_type == 'llava':
119
+ self.IMG_CONTEXT_TOKEN = '<image>'
120
+ self.IMG_START_TOKEN = ''
121
+ self.IMG_END_TOKEN = ''
122
+
123
+
124
+ if offline_processed_text_folder is not None:
125
+ raise NotImplementedError
126
+ else:
127
+ vid2metaid, metas, mask_dict = self.json_file_preprocess(expression_file, mask_file)
128
+ self.vid2metaid = vid2metaid
129
+ self.videos = list(self.vid2metaid.keys())
130
+ self.mask_dict = mask_dict
131
+ self.json_datas = metas
132
+ json_datas = metas
133
+ json_data = DatasetDict({'train': HFDataset.from_list(json_datas)})
134
+ if self.lazy:
135
+ self.text_data = build_origin_dataset(json_data, 'train')
136
+ else:
137
+ raise NotImplementedError
138
+
139
+ self.image_folder = image_folder
140
+ if extra_image_processor is not None:
141
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
142
+ self.down_ratio = 1
143
+ self.repeats = repeats
144
+
145
+ self._system = ''
146
+
147
+ self.downsample_ratio = 0.5
148
+ if self.arch_type == 'llava':
149
+ self.downsample_ratio = 1
150
+ self.image_size = 448
151
+ if self.arch_type == 'llava':
152
+ self.image_size = 336
153
+ patch_size = 14
154
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
155
+ if self.arch_type == 'qwen':
156
+ self.patch_token = 1
157
+
158
+ if preprocessor is None:
159
+ self.transformer = T.Compose([
160
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
161
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
162
+ T.ToTensor(),
163
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
164
+ ])
165
+ self.preprocessor = None
166
+ else:
167
+ self.transformer = None
168
+ self.preprocessor = BUILDER.build(preprocessor)
169
+
170
+ if special_tokens is not None:
171
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
172
+
173
+ self.use_fast = use_fast
174
+ self.n_fast_images = n_fast_images
175
+ self.fast_pool_size = fast_pool_size
176
+
177
+ self.frame_contiguous_sample = frame_contiguous_sample
178
+
179
+ # for visualization debug
180
+ self.save_folder = './work_dirs/video_debug/'
181
+ self.cur_number = 0
182
+
183
+ # exist_thr
184
+ self.exist_thr = 8
185
+ self.fast_token_after_question = fast_token_after_question
186
+ if self.fast_token_after_question:
187
+ assert self.use_fast
188
+
189
+ print("Video RES dataset, includes {} items.".format(len(self.vid2metaid)))
190
+
191
+ def __len__(self):
192
+ return len(self.vid2metaid) * self.repeats
193
+
194
+ @property
195
+ def modality_length(self):
196
+ length_list = []
197
+ for data_dict in self.vid2metaid:
198
+ cur_len = 10000
199
+ length_list.append(cur_len)
200
+ return length_list
201
+
202
+ def real_len(self):
203
+ return len(self.vid2metaid)
204
+
205
+ def json_file_preprocess(self, expression_file, mask_file):
206
+ # prepare expression annotation files
207
+ with open(expression_file, 'r') as f:
208
+ expression_datas = json.load(f)['videos']
209
+
210
+ metas = []
211
+ anno_count = 0 # serve as anno_id
212
+ vid2metaid = {}
213
+ for vid_name in expression_datas:
214
+ vid_express_data = expression_datas[vid_name]
215
+
216
+ vid_frames = sorted(vid_express_data['frames'])
217
+ vid_len = len(vid_frames)
218
+
219
+ exp_id_list = sorted(list(vid_express_data['expressions'].keys()))
220
+ for exp_id in exp_id_list:
221
+ exp_dict = vid_express_data['expressions'][exp_id]
222
+ meta = {}
223
+ meta['video'] = vid_name
224
+ meta['exp'] = exp_dict['exp'] # str
225
+ meta['mask_anno_id'] = exp_dict['anno_id']
226
+
227
+ if 'obj_id' in exp_dict.keys():
228
+ meta['obj_id'] = exp_dict['obj_id']
229
+ else:
230
+ meta['obj_id'] = [0, ] # Ref-Youtube-VOS only has one object per expression
231
+ meta['anno_id'] = [str(anno_count), ]
232
+ anno_count += 1
233
+ meta['frames'] = vid_frames
234
+ meta['exp_id'] = exp_id
235
+
236
+ meta['length'] = vid_len
237
+ metas.append(meta)
238
+ if vid_name not in vid2metaid.keys():
239
+ vid2metaid[vid_name] = []
240
+ vid2metaid[vid_name].append(len(metas) - 1)
241
+
242
+ # process mask annotation files
243
+ with open(mask_file, 'rb') as f:
244
+ mask_dict = json.load(f)
245
+
246
+ return vid2metaid, metas, mask_dict
247
+
248
+ def create_img_to_refs_mapping(self, refs_train):
249
+ img2refs = {}
250
+ for ref in refs_train:
251
+ img2refs[ref["image_id"]] = img2refs.get(ref["image_id"], []) + [ref, ]
252
+ return img2refs
253
+
254
+ def decode_mask(self, video_masks, image_size):
255
+ ret_masks = []
256
+ for object_masks in video_masks:
257
+ # None object
258
+ if len(object_masks) == 0:
259
+ if len(ret_masks) != 0:
260
+ _object_masks = ret_masks[0] * 0
261
+ else:
262
+ _object_masks = np.zeros(
263
+ (self.sampled_frames, image_size[0], image_size[1]), dtype=np.uint8)
264
+ else:
265
+ _object_masks = []
266
+ for i_frame in range(len(object_masks[0])):
267
+ _mask = np.zeros(image_size, dtype=np.uint8)
268
+ for i_anno in range(len(object_masks)):
269
+ if object_masks[i_anno][i_frame] is None:
270
+ continue
271
+ m = maskUtils.decode(object_masks[i_anno][i_frame])
272
+ if m.ndim == 3:
273
+ m = m.sum(axis=2).astype(np.uint8)
274
+ else:
275
+ m = m.astype(np.uint8)
276
+ _mask = _mask | m
277
+ _object_masks.append(_mask)
278
+ _object_masks = np.stack(_object_masks, axis=0)
279
+ # if self.pad_image_to_square:
280
+ # _object_masks = expand2square_mask(_object_masks)
281
+ ret_masks.append(_object_masks)
282
+ _shape = ret_masks[0].shape
283
+ for item in ret_masks:
284
+ if item.shape != _shape:
285
+ print([_ret_mask.shape for _ret_mask in ret_masks])
286
+ return None
287
+ ret_masks = np.stack(ret_masks, axis=0) # (n_obj, n_frames, h, w)
288
+
289
+ ret_masks = torch.from_numpy(ret_masks)
290
+ # ret_masks = F.interpolate(ret_masks, size=(self.image_size // self.down_ratio,
291
+ # self.image_size // self.down_ratio), mode='nearest')
292
+ ret_masks = ret_masks.flatten(0, 1)
293
+ return ret_masks
294
+
295
+ def dataset_map_fn(self, data_dict, select_k=5):
296
+ images = []
297
+
298
+ len_frames = len(data_dict[0]['frames'])
299
+ for object_info in data_dict:
300
+ assert len_frames == len(object_info['frames'])
301
+
302
+ # prepare images, random select k frames
303
+ if len_frames > select_k + 1:
304
+ if self.frame_contiguous_sample and random.random() < 0.5:
305
+ # do contiguous sample
306
+ selected_start_frame = np.random.choice(len_frames - select_k, 1, replace=False)
307
+ selected_frame_indexes = [selected_start_frame[0] + _i for _i in range(select_k)]
308
+ else:
309
+ selected_frame_indexes = np.random.choice(len_frames, select_k, replace=False)
310
+ else:
311
+ selected_frame_indexes = np.random.choice(len_frames, select_k, replace=True)
312
+ selected_frame_indexes.sort()
313
+
314
+ if self.use_fast:
315
+ # sample fast branch
316
+ fast_interval = len_frames / (self.n_fast_images + 1e-4)
317
+ sampled_fast_frame_idxs = [min(int(i * fast_interval), len_frames - 1) for i in range(self.n_fast_images)]
318
+ fast_video_frames = []
319
+ for selected_frame_index in sampled_fast_frame_idxs:
320
+ frame_id = data_dict[0]['frames'][selected_frame_index]
321
+ fast_video_frames.append(os.path.join(data_dict[0]['video'], frame_id + '.jpg'))
322
+ else:
323
+ fast_video_frames = None
324
+ sampled_fast_frame_idxs = None
325
+
326
+ for selected_frame_index in selected_frame_indexes:
327
+ frame_id = data_dict[0]['frames'][selected_frame_index]
328
+ images.append(os.path.join(data_dict[0]['video'], frame_id + '.jpg'))
329
+
330
+ # prepare text
331
+ expressions = [object_info['exp'] for object_info in data_dict]
332
+ if self.use_fast:
333
+ text_dict = self.prepare_text(select_k, expressions, num_image_tokens=self.patch_token,
334
+ n_fast_images=len(fast_video_frames),)
335
+ else:
336
+ text_dict = self.prepare_text(select_k, expressions, num_image_tokens=self.patch_token)
337
+
338
+
339
+ # prepare masks
340
+ video_masks = []
341
+ for object_info in data_dict:
342
+ anno_ids = object_info['mask_anno_id']
343
+ # print('anno_ids: ', anno_ids)
344
+ obj_masks = []
345
+ for anno_id in anno_ids:
346
+ anno_id = str(anno_id)
347
+ frames_masks = self.mask_dict[anno_id]
348
+ frames_masks_ = []
349
+ for frame_idx in selected_frame_indexes:
350
+ frames_masks_.append(copy.deepcopy(frames_masks[frame_idx]))
351
+ obj_masks.append(frames_masks_)
352
+ video_masks.append(obj_masks)
353
+
354
+ if self.use_fast:
355
+ fast_video_masks = []
356
+ assert sampled_fast_frame_idxs is not None
357
+ for object_info in data_dict:
358
+ anno_ids = object_info['mask_anno_id']
359
+ obj_masks = []
360
+ for anno_id in anno_ids:
361
+ anno_id = str(anno_id)
362
+ frames_masks = self.mask_dict[anno_id]
363
+ frames_masks_ = []
364
+ for frame_idx in sampled_fast_frame_idxs:
365
+ frames_masks_.append(copy.deepcopy(frames_masks[frame_idx]))
366
+ obj_masks.append(frames_masks_)
367
+ fast_video_masks.append(obj_masks)
368
+ else:
369
+ fast_video_masks = None
370
+
371
+ ret = {'images': images, 'video_masks': video_masks, 'conversation': text_dict['conversation'],
372
+ 'fast_images': fast_video_frames, 'fast_video_masks': fast_video_masks}
373
+ return ret
374
+
375
+ def prepare_text(self, n_frames, expressions, num_image_tokens=256, n_fast_images=50):
376
+
377
+ if self.use_fast and not self.fast_token_after_question:
378
+ fast_frame_token_str = f'{self.FAST_IMG_START_TOKEN}' \
379
+ f'{self.FAST_IMG_CONTEXT_TOKEN * n_fast_images * self.fast_pool_size * self.fast_pool_size}' \
380
+ f'{self.FAST_IMG_END_TOKEN}' + '\n'
381
+ else:
382
+ fast_frame_token_str = ''
383
+
384
+ frame_token_str = f'{self.IMG_START_TOKEN}' \
385
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
386
+ f'{self.IMG_END_TOKEN}'
387
+ if self.fast_token_after_question:
388
+ assert self.use_fast
389
+ after_question_str = f'{self.FAST_IMG_START_TOKEN}' \
390
+ f'{self.FAST_IMG_CONTEXT_TOKEN * n_fast_images * self.fast_pool_size * self.fast_pool_size}' \
391
+ f'{self.FAST_IMG_END_TOKEN}'
392
+ else:
393
+ after_question_str = ''
394
+
395
+ questions = []
396
+ answers = []
397
+ for i, exp in enumerate(expressions):
398
+ # the exp is a question
399
+ if '?' in exp:
400
+ questions.append(exp)
401
+ else:
402
+ exp = exp.replace('.', '').strip()
403
+ question_template = random.choice(SEG_QUESTIONS)
404
+ questions.append(question_template.format(class_name=exp.lower()))
405
+
406
+ answers.append(random.choice(ANSWER_LIST))
407
+ qa_list = []
408
+ for i, (question, answer) in enumerate(zip(questions, answers)):
409
+ if i == 0:
410
+ frame_tokens = frame_token_str + '\n'
411
+ # frame_tokens = '=' + ' '
412
+ frame_tokens = frame_tokens * n_frames
413
+ frame_tokens = frame_tokens.strip()
414
+ frame_tokens = fast_frame_token_str + frame_tokens
415
+ qa_list.append(
416
+ {'from': 'human', 'value': frame_tokens + question + after_question_str}
417
+ )
418
+ else:
419
+ qa_list.append(
420
+ {'from': 'human', 'value': question + after_question_str}
421
+ )
422
+ qa_list.append(
423
+ {'from': 'gpt', 'value': answer}
424
+ )
425
+
426
+ input = ''
427
+ conversation = []
428
+ for msg in qa_list:
429
+ if msg['from'] == 'human':
430
+ input += msg['value']
431
+ elif msg['from'] == 'gpt':
432
+ conversation.append({'input': input, 'output': msg['value']})
433
+ input = ''
434
+ else:
435
+ raise NotImplementedError
436
+
437
+ # add system information
438
+ conversation[0].update({'system': self._system})
439
+ return {'conversation': conversation}
440
+
441
+ def __getitem__(self, index):
442
+ index = index % self.real_len()
443
+ selected_video_objects = self.vid2metaid[self.videos[index]]
444
+ video_objects_infos = [copy.deepcopy(self.text_data[idx]) for idx in selected_video_objects]
445
+
446
+ if len(video_objects_infos) > self.select_number:
447
+ selected_indexes = np.random.choice(len(video_objects_infos), self.select_number, replace=False)
448
+ video_objects_infos = [video_objects_infos[_idx] for _idx in selected_indexes]
449
+ else:
450
+ selected_indexes = np.random.choice(len(video_objects_infos), self.select_number, replace=True)
451
+ video_objects_infos = [video_objects_infos[_idx] for _idx in selected_indexes]
452
+
453
+ data_dict = self.dataset_map_fn(video_objects_infos, select_k=self.sampled_frames)
454
+
455
+ assert 'images' in data_dict.keys()
456
+ pixel_values = []
457
+ extra_pixel_values = []
458
+ num_video_tokens = None
459
+ num_frame_tokens = None
460
+ if data_dict.get('images', None) is not None:
461
+ frames_files = data_dict['images']
462
+ frames_files = [os.path.join(self.image_folder, frame_file) for frame_file in frames_files]
463
+ for frame_path in frames_files:
464
+ frame_image = Image.open(frame_path).convert('RGB')
465
+ ori_width, ori_height = frame_image.size
466
+ if self.extra_image_processor is not None:
467
+ g_image = np.array(frame_image) # for grounding
468
+ g_image = self.extra_image_processor.apply_image(g_image)
469
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
470
+ extra_pixel_values.append(g_pixel_values)
471
+
472
+ if self.preprocessor is not None:
473
+ pass
474
+ else:
475
+ frame_image = self.transformer(frame_image)
476
+ pixel_values.append(frame_image)
477
+
478
+ if self.preprocessor is not None:
479
+ if self.arch_type == 'qwen':
480
+ _data_dict = self.preprocessor(pixel_values, do_resize=True, size=(self.image_size, self.image_size))
481
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
482
+ _data_dict['image_grid_thw'] = torch.tensor(_data_dict['image_grid_thw'], dtype=torch.int)
483
+ num_frame_tokens = int(_data_dict['image_grid_thw'][0].prod() * (self.downsample_ratio ** 2))
484
+ num_frames = _data_dict['image_grid_thw'].shape[0]
485
+ num_video_tokens = num_frame_tokens * num_frames
486
+ elif self.arch_type == 'llava':
487
+ _data_dict = self.preprocessor(pixel_values, do_resize=True, size=(self.image_size, self.image_size))
488
+ _data_dict['pixel_values'] = np.stack(_data_dict['pixel_values'], axis=0)
489
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
490
+ else:
491
+ raise NotImplementedError
492
+ data_dict.update(_data_dict)
493
+ else:
494
+ pixel_values = torch.stack(pixel_values, dim=0) # (n_f, 3, h, w)
495
+ data_dict['pixel_values'] = pixel_values
496
+ if self.extra_image_processor is not None:
497
+ data_dict['g_pixel_values'] = extra_pixel_values
498
+
499
+ # process and get masks
500
+ masks = self.decode_mask(data_dict['video_masks'], image_size=(ori_height, ori_width))
501
+ if masks is None:
502
+ return self.__getitem__(random.randint(0, self.real_len() - 1))
503
+ data_dict['masks'] = masks
504
+ else:
505
+ data_dict['pixel_values'] = torch.zeros(0, 3, self.image_size, self.image_size)
506
+ data_dict['masks'] = None
507
+
508
+ if num_video_tokens is not None:
509
+ assert self.patch_token == 1
510
+ input_str = data_dict['conversation'][0]['input']
511
+ input_str = input_str.replace(self.IMG_CONTEXT_TOKEN, self.IMG_CONTEXT_TOKEN * num_frame_tokens)
512
+ assert input_str.count(self.IMG_CONTEXT_TOKEN) == num_video_tokens
513
+ data_dict['conversation'][0]['input'] = input_str
514
+
515
+ result = self.template_map_fn(data_dict)
516
+ data_dict.update(result)
517
+ result = video_lisa_encode_fn(data_dict, tokenizer=self.tokenizer, max_length=self.max_length)
518
+ data_dict.update(result)
519
+
520
+ # for fast branch
521
+ if self.use_fast:
522
+ fast_pixel_values = []
523
+ frames_files = data_dict['fast_images']
524
+ frames_files = [os.path.join(self.image_folder, frame_file) for frame_file in frames_files]
525
+ for frame_path in frames_files:
526
+ frame_image = Image.open(frame_path).convert('RGB')
527
+ ori_width, ori_height = frame_image.size
528
+
529
+ frame_image = self.transformer(frame_image)
530
+ fast_pixel_values.append(frame_image)
531
+
532
+ fast_pixel_values = torch.stack(fast_pixel_values, dim=0) # (n_f, 3, h, w)
533
+ data_dict['fast_pixel_values'] = fast_pixel_values
534
+
535
+ # process and get masks
536
+ masks = self.decode_mask(data_dict['fast_video_masks'], image_size=(ori_height, ori_width))
537
+
538
+ if masks is None:
539
+ return self.__getitem__(random.randint(0, self.real_len() - 1))
540
+
541
+ data_dict['fast_exists'] = masks.to(dtype=torch.int).sum(dim=(-2, -1)).ge(self.exist_thr).unsqueeze(-1)
542
+
543
+
544
+ del data_dict['fast_video_masks']
545
+ data_dict['type'] = 'video'
546
+ return data_dict
547
+
548
+ def visualization_debug(self, data_dict):
549
+ save_folder = os.path.join(self.save_folder, 'sample_{}'.format(self.cur_number))
550
+ if not os.path.exists(save_folder):
551
+ os.makedirs(save_folder, exist_ok=True)
552
+ self.cur_number += 1
553
+
554
+ # images
555
+
556
+ show_images = []
557
+
558
+ pixel_values = data_dict['pixel_values']
559
+ save_folder_image = os.path.join(save_folder, 'image')
560
+ if not os.path.exists(save_folder_image):
561
+ os.mkdir(save_folder_image)
562
+ for i_image, image_pixel_value in enumerate(pixel_values):
563
+ # print(image_pixel_value.shape)
564
+ image_pixel_value[0] = image_pixel_value[0] * 0.2686
565
+ image_pixel_value[1] = image_pixel_value[1] * 0.2613
566
+ image_pixel_value[2] = image_pixel_value[2] * 0.2757
567
+ image_pixel_value[0] = image_pixel_value[0] + 0.4814
568
+ image_pixel_value[1] = image_pixel_value[1] + 0.4578
569
+ image_pixel_value[2] = image_pixel_value[2] + 0.4082
570
+ image_pixel_value = image_pixel_value * 255
571
+ image_pixel_value = image_pixel_value.permute(1, 2, 0)
572
+ image_pixel_value = image_pixel_value.to(torch.uint8).numpy()
573
+ # print(os.path.join(save_folder_image, '{}.jpg'.format(i_image)))
574
+ # print(image_pixel_value.shape)
575
+ show_images.append(image_pixel_value)
576
+ cv2.imwrite(os.path.join(save_folder_image, '{}.jpg'.format(i_image)), image_pixel_value)
577
+
578
+ # text
579
+ input_text = self.tokenizer.decode(data_dict['input_ids'], skip_special_tokens=False)
580
+ with open(os.path.join(save_folder, 'text.json'), 'w') as f:
581
+ json.dump([input_text], f)
582
+
583
+ # masks
584
+ save_folder_mask = os.path.join(save_folder, 'mask')
585
+ if not os.path.exists(save_folder_mask):
586
+ os.mkdir(save_folder_mask)
587
+ n_frames = len(pixel_values)
588
+ masks = data_dict['masks']
589
+ _, h, w = masks.shape
590
+ masks = masks.reshape(-1, n_frames, h, w)
591
+ for i_obj, obj_masks in enumerate(masks):
592
+ save_folder_mask_obj_folder = os.path.join(save_folder_mask, 'obj_{}'.format(i_obj))
593
+ if not os.path.exists(save_folder_mask_obj_folder):
594
+ os.mkdir(save_folder_mask_obj_folder)
595
+ for i_frame, f_mask in enumerate(obj_masks):
596
+ f_mask = f_mask.numpy()
597
+ f_mask = f_mask * 255
598
+ f_mask = np.stack([f_mask * 1, f_mask * 0, f_mask * 0], axis=2)
599
+ f_mask = show_images[i_frame] * 0.3 + 0.7 * f_mask
600
+ f_mask = f_mask.astype(np.uint8)
601
+ cv2.imwrite(os.path.join(save_folder_mask_obj_folder, '{}.png'.format(i_frame)), f_mask)
602
+ return
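
VideoReVOSDataset wires the ReVOS expression/mask annotations, the frame sampler and the conversation templates into a single dataset. A minimal construction sketch, assuming placeholder data paths and the vicuna prompt template that the RefCOCO example later in this commit also uses (the actual training setup lives under projects/llava_sam2/configs):

    # Construction sketch only; the paths and the LLM checkpoint below are placeholders.
    from transformers import AutoTokenizer
    from xtuner.dataset.map_fns import template_map_fn_factory
    from xtuner.utils.templates import PROMPT_TEMPLATE

    tokenizer = dict(type=AutoTokenizer.from_pretrained,
                     pretrained_model_name_or_path='lmsys/vicuna-7b-v1.5')
    dataset = VideoReVOSDataset(
        image_folder='data/revos/',                                 # placeholder path
        expression_file='data/revos/meta_expressions_train.json',   # placeholder path
        mask_file='data/revos/mask_dict.json',                      # placeholder path
        tokenizer=tokenizer,
        special_tokens=['[SEG]'],
        template_map_fn=dict(type=template_map_fn_factory,
                             template=PROMPT_TEMPLATE.vicuna),
        sampled_frames=5,
        select_number=5,
        max_length=2048,
    )
    sample = dataset[0]  # dict with 'pixel_values', 'masks', 'input_ids', 'labels', 'type', ...
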
projects/llava_sam2/datasets/RefCOCO_Dataset.py ADDED
@@ -0,0 +1,338 @@
1
+ import copy
2
+ import random
3
+ import glob
4
+ import json
5
+ import logging
6
+ import os
7
+ from typing import Literal
8
+
9
+ import torch
10
+
11
+ from mmengine import print_log
12
+ from mmengine.config import Config, ConfigDict
13
+ from PIL import Image
14
+ from torch.utils.data import Dataset
15
+ import numpy as np
16
+ import torch.nn.functional as F
17
+ import torchvision.transforms as T
18
+ from torchvision.transforms.functional import InterpolationMode
19
+ from pycocotools.coco import COCO
20
+ from pycocotools import mask as mask_utils
21
+
22
+ from xtuner.registry import BUILDER
23
+ from xtuner.utils import IGNORE_INDEX
24
+ from xtuner.dataset.utils import encode_fn
25
+ from xtuner.dataset.map_fns import llava_map_fn
26
+
27
+ from projects.glamm.datasets.utils.utils import expand2square
28
+
29
+ from projects.glamm.datasets.utils.utils import SEG_QUESTIONS, ANSWER_LIST
30
+ from projects.glamm.utils import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
31
+
32
+ from third_parts.mmdet.datasets.refcoco import RefCocoDataset
33
+
34
+ from .utils import dynamic_preprocess
35
+
36
+
37
+ class ReferSegmDataset(RefCocoDataset):
38
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
39
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
40
+ IMG_START_TOKEN = '<img>'
41
+ IMG_END_TOKEN = '</img>'
42
+
43
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
44
+ IMAGENET_STD = (0.229, 0.224, 0.225)
45
+
46
+ def __init__(self,
47
+ data_root,
48
+ ann_file=None,
49
+ split_file=None,
50
+ special_tokens=None,
51
+ prompt_template=None,
52
+ extra_image_processor=None,
53
+ data_prefix=dict(img_path='train2014/'),
54
+ tokenizer=None,
55
+ max_length=2048,
56
+ num_classes_per_sample=3,
57
+ single_image_mode=False,
58
+ arch_type: Literal['intern_vl', 'qwen'] = 'intern_vl',
59
+ preprocessor=None,
60
+ **kwargs):
61
+ super().__init__(
62
+ data_root=data_root,
63
+ data_prefix=data_prefix,
64
+ pipeline=None,
65
+ ann_file=ann_file,
66
+ split_file=split_file,
67
+ **kwargs,
68
+ )
69
+ self.begin_str = f'{DEFAULT_IMAGE_TOKEN}\n'
70
+ if extra_image_processor is not None:
71
+ self.extra_image_processor = BUILDER.build(extra_image_processor)
72
+
73
+ self.arch_type = arch_type
74
+ if self.arch_type == 'qwen':
75
+ self.IMG_CONTEXT_TOKEN = '<|image_pad|>'
76
+ self.IMG_START_TOKEN = '<|vision_start|>'
77
+ self.IMG_END_TOKEN = '<|vision_end|>'
78
+ elif self.arch_type == 'llava':
79
+ self.IMG_CONTEXT_TOKEN = '<image>'
80
+ self.IMG_START_TOKEN = ''
81
+ self.IMG_END_TOKEN = ''
82
+
83
+ self.tokenizer = BUILDER.build(tokenizer)
84
+ if special_tokens is not None:
85
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
86
+
87
+ self.image_folder = data_root
88
+ self.template = prompt_template
89
+ self.max_length = max_length
90
+ if self.arch_type == 'intern_vl':
91
+ # self._system = '你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。'
92
+ self._system = ''
93
+ self.template['INSTRUCTION'] = '<|user|>\n{input}<|end|><|assistant|>\n'
94
+ elif self.arch_type == 'qwen':
95
+ self._system = ''
96
+ elif self.arch_type == 'llava':
97
+ self._system = ''
98
+
99
+ self.num_classes_per_sample = num_classes_per_sample
100
+ self.min_dynamic_patch = 1
101
+ self.max_dynamic_patch = 12
102
+ self.downsample_ratio = 0.5
103
+ if self.arch_type == 'llava':
104
+ self.downsample_ratio = 1
105
+ self.image_size = 448
106
+ if self.arch_type == 'llava':
107
+ self.image_size = 336
108
+ self.use_thumbnail = True
109
+ patch_size = 14
110
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
111
+
112
+ if preprocessor is None:
113
+ self.transformer = T.Compose([
114
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
115
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
116
+ T.ToTensor(),
117
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
118
+ ])
119
+ self.preprocessor = None
120
+ else:
121
+ self.transformer = None
122
+ self.preprocessor = BUILDER.build(preprocessor)
123
+ self.arch_type = arch_type
124
+ self.single_image_mode = single_image_mode
125
+ self._max_refetch = 1000
126
+
127
+ print("Image RES dataset, includes {} items.".format(len(self)))
128
+
129
+ @property
130
+ def modality_length(self):
131
+ import pickle
132
+ length_list = []
133
+ for idx in range(len(self)):
134
+ length_list.append(100)
135
+ return length_list
136
+
137
+ def _parse_annotations(self, ann_info):
138
+ image_path = ann_info['img_path']
139
+ image = Image.open(image_path).convert('RGB')
140
+ width, height = image.size
141
+
142
+ masks, phrases = [], []
143
+ instances, text = ann_info['instances'], ann_info['text']
144
+ # index = np.random.choice(range(len(instances)), min(
145
+ # len(instances), self.num_classes_per_sample))
146
+ index = np.random.choice(range(len(instances)), self.num_classes_per_sample, replace=True)
147
+ for idx in index:
148
+ inst = instances[idx]
149
+ phrase = text[idx].lower()
150
+ if '.' == phrase[-1]:
151
+ phrase = phrase[:-1]
152
+ phrases.append(phrase)
153
+ binary_mask = np.zeros((height, width), dtype=np.uint8)
154
+ for seg in inst["mask"]:
155
+ rles = mask_utils.frPyObjects([seg], height, width)
156
+ m = mask_utils.decode(rles)
157
+ m = m.astype(np.uint8)
158
+ binary_mask += m.squeeze()
159
+ masks.append(binary_mask)
160
+
161
+ conversation = []
162
+ for i, phrase in enumerate(phrases):
163
+ question = random.choice(SEG_QUESTIONS).format(class_name=phrase)
164
+ if i == 0:
165
+ question = self.begin_str + question
166
+ conversation.append({'from': 'human', 'value': question})
167
+ conversation.append({'from': 'gpt', 'value': random.choice(ANSWER_LIST)})
168
+ masks = torch.stack([torch.from_numpy(mask) for mask in masks], dim=0)
169
+
170
+ ann_info.update({
171
+ 'masks': masks,
172
+ 'conversations': conversation,
173
+ 'image': image_path
174
+ })
175
+ return ann_info
176
+
177
+ def prepare_data(self, index):
178
+ data_dict = super().prepare_data(index)
179
+ data_dict = self._parse_annotations(data_dict)
180
+ if data_dict is None:
181
+ return None
182
+
183
+ out_data_dict = {}
184
+ if 'masks' in data_dict:
185
+ out_data_dict['masks'] = data_dict['masks']
186
+
187
+ if data_dict.get('image', None) is not None:
188
+ image_file = data_dict['image']
189
+ try:
190
+ image = Image.open(image_file).convert('RGB')
191
+ except Exception as e:
192
+ print(f'Error: {e}', flush=True)
193
+ print_log(f'Error: {e}', logger='current')
194
+ return None
195
+ if hasattr(self, 'extra_image_processor'):
196
+ g_image = np.array(image) # for grounding
197
+ g_image = self.extra_image_processor.apply_image(g_image)
198
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
199
+ out_data_dict['g_pixel_values'] = g_pixel_values
200
+
201
+ if self.single_image_mode:
202
+ images = [image]
203
+ else:
204
+ images = dynamic_preprocess(image, self.min_dynamic_patch,
205
+ self.max_dynamic_patch,
206
+ self.image_size, self.use_thumbnail)
207
+ if self.preprocessor is not None:
208
+ if self.arch_type == 'qwen':
209
+ _data_dict = self.preprocessor(images, do_resize=True)
210
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
211
+ _data_dict['image_grid_thw'] = torch.tensor(_data_dict['image_grid_thw'], dtype=torch.int)
212
+ num_image_tokens = int(_data_dict['image_grid_thw'][0].prod() * (self.downsample_ratio ** 2))
213
+ elif self.arch_type == 'llava':
214
+ _data_dict = self.preprocessor(images, do_resize=True, size=(self.image_size, self.image_size))
215
+ _data_dict['pixel_values'] = np.stack(_data_dict['pixel_values'], axis=0)
216
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
217
+ num_image_tokens = _data_dict['pixel_values'].shape[0] * self.patch_token
218
+ else:
219
+ raise NotImplementedError
220
+ out_data_dict.update(_data_dict)
221
+ else:
222
+ pixel_values = [self.transformer(image) for image in images]
223
+ pixel_values = torch.stack(pixel_values)
224
+ out_data_dict['pixel_values'] = pixel_values
225
+
226
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
227
+ image_token_str = f'{self.IMG_START_TOKEN}' \
228
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
229
+ f'{self.IMG_END_TOKEN}'
230
+ token_dict = self.get_inputid_labels(data_dict['conversations'], image_token_str)
231
+ out_data_dict.update(token_dict)
232
+ else:
233
+ token_dict = self.get_inputid_labels(data_dict['conversations'], None)
234
+ out_data_dict.update(token_dict)
235
+ out_data_dict['pixel_values'] = torch.zeros(1, 3, self.image_size, self.image_size)
236
+ return out_data_dict
237
+
238
+ def get_inputid_labels(self, conversations, image_token_str) -> dict:
239
+ input = ''
240
+ out_conversation = []
241
+ while conversations and conversations[0]['from'] == 'gpt':
242
+ # Skip the first one if it is from gpt
243
+ conversations = conversations[1:]
244
+ for msg in conversations:
245
+ if msg['from'] == 'human':
246
+ if image_token_str is None and '<image>' in msg['value']:
247
+ msg['value'] = msg['value'].replace('<image>', '')
248
+ if '<image>' in msg['value']:
249
+ msg['value'] = msg['value'].replace('<image>', image_token_str).strip()
250
+ input += msg['value'].strip()
251
+ elif msg['from'] == 'gpt':
252
+ out_conversation.append({
253
+ 'input': input,
254
+ 'output': msg['value'].strip()
255
+ })
256
+ input = ''
257
+ else:
258
+ raise NotImplementedError
259
+
260
+ input_ids, labels = [], []
261
+ for i, single_turn_conversation in enumerate(out_conversation):
262
+ input = single_turn_conversation.get('input', '')
263
+ if input is None:
264
+ input = ''
265
+ input_text = self.template.INSTRUCTION.format(
266
+ input=input, round=i + 1)
267
+
268
+ if i == 0:
269
+ if self._system != '' and self._system is not None:
270
+ system = self.template.SYSTEM.format(system=self._system)
271
+ input_text = system + input_text
272
+ input_encode = self.tokenizer.encode(
273
+ input_text, add_special_tokens=True)
274
+ else:
275
+ input_encode = self.tokenizer.encode(
276
+ input_text, add_special_tokens=False)
277
+ input_ids += input_encode
278
+ labels += [IGNORE_INDEX] * len(input_encode)
279
+
280
+ output_text = single_turn_conversation.get('output', '')
281
+ if self.template.get('SUFFIX', None):
282
+ output_text += self.template.SUFFIX
283
+ output_encode = self.tokenizer.encode(
284
+ output_text, add_special_tokens=False)
285
+ input_ids += output_encode
286
+ labels += copy.deepcopy(output_encode)
287
+
288
+ if len(input_ids) > self.max_length:
289
+ input_ids = input_ids[:self.max_length]
290
+ labels = labels[:self.max_length]
291
+ # print('len_ids: ', len(input_ids))
292
+ return {'input_ids': input_ids, 'labels': labels}
293
+
294
+ def __getitem__(self, index):
295
+ for _ in range(self._max_refetch + 1):
296
+ data = self.prepare_data(index)
297
+ # Broken images may cause the returned data to be None
298
+ if data is None:
299
+ index = self._rand_another()
300
+ continue
301
+ return data
302
+
303
+
304
+ if __name__ == '__main__':
305
+ from transformers import CLIPImageProcessor, AutoTokenizer
306
+ from third_parts.segment_anything.utils.transforms import ResizeLongestSide
307
+
308
+ pretrained_model = 'MBZUAI/GLaMM-GranD-Pretrained'
309
+ llm_name_or_path = 'lmsys/vicuna-7b-v1.5'
310
+
311
+ tokenizer = dict(
312
+ type=AutoTokenizer.from_pretrained,
313
+ pretrained_model_name_or_path=llm_name_or_path)
314
+ image_processor = dict(
315
+ type=CLIPImageProcessor.from_pretrained,
316
+ pretrained_model_name_or_path='openai/clip-vit-large-patch14-336')
317
+ extra_image_processor = dict(
318
+ type=ResizeLongestSide,
319
+ target_length=1024,
320
+ )
321
+ from xtuner.utils.templates import PROMPT_TEMPLATE
322
+
323
+ prompt_template = PROMPT_TEMPLATE.vicuna
324
+ from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory, template_map_fn
325
+ from projects.glamm.datasets.collate_fns.glamm_collate_fn import glamm_collate_fn
326
+
327
+ dataset = ReferSegmDataset(
328
+ tokenizer=tokenizer,
329
+ special_tokens=['[SEG]'],
330
+ extra_image_processor=extra_image_processor,
331
+ prompt_template=prompt_template,
332
+ data_root='data/coco/',
333
+ data_prefix=dict(img_path='train2014/'),
334
+ ann_file='refcoco+/instances.json',
335
+ split_file='refcoco+/refs(unc).p',
336
+ )
337
+ for i in range(1000):
338
+ dataset[i]
projects/llava_sam2/datasets/RefYoutubeVOS_Dataset.py ADDED
@@ -0,0 +1,47 @@
1
+ from .ReVOS_Dataset import VideoReVOSDataset
2
+ import json
3
+ import pickle
4
+
5
+ class VideoRefYoutubeVOSDataset(VideoReVOSDataset):
6
+
7
+ def json_file_preprocess(self, expression_file, mask_file):
8
+ # prepare expression annotation files
9
+ with open(expression_file, 'r') as f:
10
+ expression_datas = json.load(f)['videos']
11
+
12
+ metas = []
13
+ anno_count = 0 # serve as anno_id
14
+ vid2metaid = {}
15
+ for vid_name in expression_datas:
16
+ vid_express_data = expression_datas[vid_name]
17
+
18
+ vid_frames = sorted(vid_express_data['frames'])
19
+ vid_len = len(vid_frames)
20
+
21
+ exp_id_list = sorted(list(vid_express_data['expressions'].keys()))
22
+ for exp_id in exp_id_list:
23
+ exp_dict = vid_express_data['expressions'][exp_id]
24
+ meta = {}
25
+ meta['video'] = vid_name
26
+ meta['exp'] = exp_dict['exp'] # str
27
+ meta['mask_anno_id'] = [str(anno_count), ]
28
+
29
+ if 'obj_id' in exp_dict.keys():
30
+ meta['obj_id'] = exp_dict['obj_id']
31
+ else:
32
+ meta['obj_id'] = [0, ] # Ref-Youtube-VOS only has one object per expression
33
+ meta['anno_id'] = [str(anno_count), ]
34
+ anno_count += 1
35
+ meta['frames'] = vid_frames
36
+ meta['exp_id'] = exp_id
37
+
38
+ meta['length'] = vid_len
39
+ metas.append(meta)
40
+ if vid_name not in vid2metaid.keys():
41
+ vid2metaid[vid_name] = []
42
+ vid2metaid[vid_name].append(len(metas) - 1)
43
+
44
+ # process mask annotation files
45
+ with open(mask_file, 'rb') as f:
46
+ mask_dict = pickle.load(f)
47
+ return vid2metaid, metas, mask_dict
projects/llava_sam2/datasets/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from .collect_fns import video_lisa_collate_fn
2
+ from .MeVIS_Dataset import VideoMeVISDataset
3
+ from .ReVOS_Dataset import VideoReVOSDataset
4
+ from .RefYoutubeVOS_Dataset import VideoRefYoutubeVOSDataset
5
+ from .encode_fn import video_lisa_encode_fn
6
+ from .RefCOCO_Dataset import ReferSegmDataset
7
+ from .ReSAM2_Dataset import VideoSAM2Dataset
8
+ from .vqa_dataset import LLaVADataset, InfinityMMDataset
9
+
10
+ from .GCG_Dataset import GranDfGCGDataset, FlickrGCGDataset, OpenPsgGCGDataset, RefCOCOgGCGDataset
11
+ from .Grand_Dataset import GranDDataset
12
+
13
+ from .Osprey_Dataset import OspreyDataset, OspreyDescriptionDataset, OspreyShortDescriptionDataset
14
+
15
+ from .ChatUniVi_Dataset import VideoChatUniViDataset
projects/llava_sam2/datasets/collect_fns.py ADDED
@@ -0,0 +1,206 @@
1
+ from typing import Dict, Sequence
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch.nn.utils.rnn import pad_sequence
6
+
7
+ from xtuner.parallel.sequence import (get_sequence_parallel_world_size,
8
+ pad_for_sequence_parallel)
9
+ from xtuner.utils import DEFAULT_PAD_TOKEN_INDEX, IGNORE_INDEX
10
+
11
+
12
+ def video_lisa_collate_fn(instances: Sequence[Dict],
13
+ pad_index: int = DEFAULT_PAD_TOKEN_INDEX,
14
+ return_hf_format: bool = False,
15
+ use_varlen_attn: bool = False):
16
+ seq_parallel_world_size = get_sequence_parallel_world_size()
17
+
18
+ input_ids, labels = [], []
19
+ has_image = any(inst.get('pixel_values') is not None for inst in instances)
20
+ has_pe = any(inst.get('image_grid_thw', None) is not None for inst in instances)
21
+ has_fast_image = any(inst.get('fast_pixel_values', None) is not None for inst in instances)
22
+ has_grounding_image = any(inst.get('g_pixel_values') is not None for inst in instances)
23
+ has_mask = any(inst.get('masks') is not None for inst in instances)
24
+ has_bboxes = any(inst.get('bboxes') is not None for inst in instances)
25
+ has_points = any(inst.get('points') is not None for inst in instances)
26
+ has_fast_exists = any(inst.get('fast_exists') is not None for inst in instances)
27
+
28
+ has_vp = any(inst.get('vp_overall_mask') is not None for inst in instances)
29
+ has_prompt_mask = any(inst.get('prompt_masks') is not None for inst in instances)
30
+
31
+ if use_varlen_attn:
32
+ position_ids, cumulative_len = [], []
33
+ assert len(instances) == 1, (
34
+ f'If utilizing varlen attention, the batch size should be'
35
+ f' set to 1, but got {len(instances)}')
36
+ assert not has_image, 'Currently, it is not configured to '
37
+ 'accommodate the use of varlen Attention in multimodal training'
38
+
39
+ if has_image:
40
+ pixel_values = []
41
+ frames_per_batch = []
42
+ image_grid_thw = []
43
+ if has_grounding_image:
44
+ grounding_pixel_values = []
45
+ if has_mask:
46
+ object_masks = []
47
+ if has_bboxes:
48
+ object_bboxes = []
49
+ if has_points:
50
+ prompt_points = []
51
+ if has_fast_image:
52
+ fast_pixel_values = []
53
+ if has_fast_exists:
54
+ fast_exists = []
55
+ if has_vp:
56
+ vp_overall_mask = []
57
+ else:
58
+ vp_overall_mask = None
59
+
60
+ if has_prompt_mask:
61
+ prompt_masks = []
62
+ else:
63
+ prompt_masks = None
64
+
65
+ for example in instances:
66
+ input_ids.append(torch.LongTensor(example['input_ids']))
67
+ labels.append(torch.LongTensor(example['labels']))
68
+ if use_varlen_attn:
69
+ cumulative_len.append(torch.IntTensor(example['cumulative_len']))
70
+ position_ids.append(torch.LongTensor(example['position_ids']))
71
+
72
+ if has_image:
73
+ pixel_values.append(example['pixel_values'])
74
+ if has_pe:
75
+ image_grid_thw.append(example['image_grid_thw'])
76
+ if has_vp:
77
+ if 'vp_overall_mask' in example.keys() and example['vp_overall_mask'] is not None:
78
+ vp_overall_mask.append(example['vp_overall_mask'])
79
+ else:
80
+ vp_overall_mask.append(torch.Tensor([False] * len(pixel_values[-1])))
81
+ if has_fast_image:
82
+ if 'fast_pixel_values' in example.keys() and example['fast_pixel_values'] is not None:
83
+ fast_pixel_values.append(example['fast_pixel_values'])
84
+ if has_fast_exists:
85
+ if 'fast_exists' in example.keys() and example['fast_exists'] is not None:
86
+ fast_exists.append(example['fast_exists'])
87
+ if has_grounding_image and 'g_pixel_values' in example.keys():
88
+ if isinstance(example['g_pixel_values'], list):
89
+ grounding_pixel_values += example['g_pixel_values']
90
+ frames_per_batch.append(len(example['g_pixel_values']))
91
+ else:
92
+ grounding_pixel_values.append(example['g_pixel_values'])
93
+ frames_per_batch.append(1)
94
+
95
+ if has_mask:
96
+ if 'masks' in example.keys() and example['masks'] is not None:
97
+ if isinstance(example['masks'], list):
98
+ if isinstance(example['masks'][0], np.ndarray):
99
+ _masks = np.stack(example['masks'], axis=0)
100
+ _masks = torch.from_numpy(_masks)
101
+ object_masks.append(_masks)
102
+ else:
103
+ object_masks.append(torch.stack(example['masks'], dim=0))
104
+ else:
105
+ object_masks.append(example['masks'])
106
+ if has_bboxes:
107
+ if 'bboxes' in example.keys() and example['bboxes'] is not None:
108
+ object_bboxes.append(example['bboxes'])
109
+ if has_points:
110
+ if 'points' in example.keys() and example['points'] is not None:
111
+ prompt_points.append(example['points'])
112
+
113
+ if has_prompt_mask:
114
+ if 'prompt_masks' in example.keys():
115
+ prompt_masks.append(example['prompt_masks'])
116
+
117
+ ori_length = [len(ids) for ids in input_ids]
118
+ if len(instances) > 1:
119
+ input_ids = pad_sequence(
120
+ input_ids, batch_first=True, padding_value=pad_index)
121
+ labels = pad_sequence(
122
+ labels, batch_first=True, padding_value=IGNORE_INDEX)
123
+ else:
124
+ input_ids = torch.stack(input_ids)
125
+ labels = torch.stack(labels)
126
+
127
+ if use_varlen_attn:
128
+ assert input_ids.size(1) % seq_parallel_world_size == 0
129
+ attention_mask = None
130
+ position_ids = torch.stack(position_ids, dim=0)
131
+ else:
132
+ # Some tokenizers have the same eos token and pad token, so input_ids
133
+ # cannot be masked directly based on the pad token id.
134
+ attention_mask = torch.zeros_like(input_ids).bool()
135
+ for i, length in enumerate(ori_length):
136
+ attention_mask[i, :length] = True
137
+
138
+ bs, seq_len = input_ids.shape
139
+ position_ids = torch.arange(seq_len).unsqueeze(0).long().repeat(bs, 1)
140
+
141
+ if seq_parallel_world_size > 1:
142
+ input_ids = pad_for_sequence_parallel(input_ids, pad_index)
143
+ labels = pad_for_sequence_parallel(labels, IGNORE_INDEX)
144
+ position_ids = pad_for_sequence_parallel(position_ids, 0)
145
+ if attention_mask is not None:
146
+ attention_mask = pad_for_sequence_parallel(attention_mask, 0)
147
+
148
+ if use_varlen_attn:
149
+ max_seqlen = (
150
+ cumulative_len[0][1:] - # noqa: W504
151
+ cumulative_len[0][:-1]).max().item()
152
+ data_dict = {
153
+ 'input_ids': input_ids,
154
+ 'cumulative_len': cumulative_len,
155
+ 'position_ids': position_ids,
156
+ 'labels': labels,
157
+ 'max_seqlen': max_seqlen
158
+ }
159
+ else:
160
+ data_dict = {
161
+ 'input_ids': input_ids,
162
+ 'attention_mask': attention_mask,
163
+ 'position_ids': position_ids,
164
+ 'labels': labels
165
+ }
166
+
167
+ if has_image:
168
+ if all(x.shape == pixel_values[0].shape for x in pixel_values):
169
+ pixel_values = torch.stack(pixel_values, dim=0)
170
+ data_dict['frames_per_batch'] = frames_per_batch
171
+ data_dict['pixel_values'] = pixel_values
172
+ if has_pe:
173
+ data_dict['image_grid_thw'] = image_grid_thw
174
+
175
+ if has_fast_image:
176
+ if all(x.shape == fast_pixel_values[0].shape for x in fast_pixel_values):
177
+ fast_pixel_values = torch.stack(fast_pixel_values, dim=0)
178
+ data_dict['fast_pixel_values'] = fast_pixel_values
179
+
180
+ if has_fast_exists:
181
+ data_dict['fast_exists'] = fast_exists
182
+
183
+ if has_vp:
184
+ data_dict['vp_overall_mask'] = torch.cat(vp_overall_mask, dim=0)
185
+
186
+ if has_prompt_mask:
187
+ data_dict['prompt_masks'] = prompt_masks
188
+
189
+ if has_grounding_image:
190
+ # if all(x.shape == grounding_pixel_values[0].shape for x in grounding_pixel_values):
191
+ # grounding_pixel_values = torch.stack(grounding_pixel_values, dim=0)
192
+ data_dict['g_pixel_values'] = grounding_pixel_values
193
+
194
+ if has_mask:
195
+ data_dict['masks'] = object_masks
196
+
197
+ if has_bboxes:
198
+ data_dict['bboxes'] = object_bboxes
199
+
200
+ if has_points:
201
+ data_dict['points'] = prompt_points
202
+
203
+ if return_hf_format:
204
+ return data_dict
205
+ else:
206
+ return {'data': data_dict, 'data_samples': None}
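
video_lisa_collate_fn pads input_ids/labels, rebuilds the attention mask and regroups the optional visual fields (pixel values, grounding images, masks, fast-branch tensors). A minimal wiring sketch, assuming a single-process run where xtuner's sequence-parallel world size resolves to 1; `dataset` stands for any of the datasets defined in this package:

    # Wiring sketch only; `dataset` and the batch size are placeholders.
    from functools import partial
    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=2, shuffle=True,
                        collate_fn=partial(video_lisa_collate_fn, return_hf_format=False))
    batch = next(iter(loader))
    data = batch['data']  # padded 'input_ids', 'labels', 'attention_mask', 'position_ids',
                          # plus 'pixel_values', 'masks', 'g_pixel_values', ... when present
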
projects/llava_sam2/datasets/encode_fn.py ADDED
@@ -0,0 +1,144 @@
1
+ import copy
2
+ from xtuner.dataset.utils import get_bos_eos_token_ids
3
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, IMAGE_TOKEN_INDEX
4
+
5
+ def video_lisa_encode_fn(
6
+ example,
7
+ tokenizer,
8
+ max_length,
9
+ input_ids_with_output=True,
10
+ **kwargs
11
+ ):
12
+ """We only support the following three scenarios:
13
+
14
+ 1. Incremental pretraining dataset.
15
+ example['conversation'] = [
16
+ {
17
+ 'input': '',
18
+ 'output': '### Human: Can you write xxx'
19
+ }
20
+ ]
21
+
22
+ 2. Single-turn conversation dataset.
23
+ example['conversation'] = [
24
+ {
25
+ 'input': 'Give three tips for staying healthy.',
26
+ 'output': '1.Eat a balanced diet xxx'
27
+ }
28
+ ]
29
+
30
+ 3. Multi-turn conversation dataset.
31
+ example['conversation'] = [
32
+ {
33
+ 'input': 'Give three tips for staying healthy.',
34
+ 'output': '1.Eat a balanced diet xxx'
35
+ },
36
+ {
37
+ 'input': 'Please expand on the second point.',
38
+ 'output': 'Here is an expanded explanation of the xxx'
39
+ }
40
+ ]
41
+ """
42
+ bos_token_id, eos_token_id = get_bos_eos_token_ids(tokenizer)
43
+ is_multi_turn_conversation = len(example['conversation']) > 1
44
+ if is_multi_turn_conversation:
45
+ assert input_ids_with_output
46
+
47
+ input_ids, labels = [], []
48
+ next_needs_bos_token = True
49
+ for single_turn_conversation in example['conversation']:
50
+ input = single_turn_conversation['input']
51
+ input_encode = tokenizer.encode(input, add_special_tokens=False)
52
+ if next_needs_bos_token:
53
+ input_ids += bos_token_id
54
+ labels += [IGNORE_INDEX] * len(bos_token_id)
55
+ input_ids += input_encode
56
+ labels += [IGNORE_INDEX] * len(input_encode)
57
+ if input_ids_with_output:
58
+ # Add output
59
+ output_with_loss = single_turn_conversation.get(
60
+ 'output_with_loss', True)
61
+ output = single_turn_conversation['output']
62
+ output_encode = tokenizer.encode(output, add_special_tokens=False)
63
+ input_ids += output_encode
64
+ if output_with_loss:
65
+ labels += copy.deepcopy(output_encode)
66
+ else:
67
+ labels += [IGNORE_INDEX] * len(output_encode)
68
+ # Add EOS_TOKEN (with loss)
69
+ if single_turn_conversation.get('need_eos_token', True):
70
+ next_needs_bos_token = True
71
+ input_ids += eos_token_id
72
+ if output_with_loss:
73
+ labels += copy.deepcopy(eos_token_id)
74
+ else:
75
+ labels += [IGNORE_INDEX] * len(eos_token_id)
76
+ else:
77
+ next_needs_bos_token = False
78
+ # Add SEP (without loss)
79
+ sep = single_turn_conversation.get('sep', '')
80
+ if sep != '':
81
+ sep_encode = tokenizer.encode(sep, add_special_tokens=False)
82
+ input_ids += sep_encode
83
+ labels += [IGNORE_INDEX] * len(sep_encode)
84
+
85
+ if len(input_ids) > max_length:
86
+ input_ids = input_ids[:max_length]
87
+ labels = labels[:max_length]
88
+ return {'input_ids': input_ids, 'labels': labels}
89
+
90
+
91
+ def video_lisa_encode_multi_conv_fn(
92
+ example,
93
+ tokenizer,
94
+ max_length,
95
+ input_ids_with_output=True
96
+ ):
97
+ """We only support the following three scenarios:
98
+
99
+ 1. Incremental pretraining dataset.
100
+ example['conversation'] = [
101
+ {
102
+ 'input': '',
103
+ 'output': '### Human: Can you write xxx'
104
+ }
105
+ ]
106
+
107
+ 2. Single-turn conversation dataset.
108
+ example['conversation'] = [
109
+ {
110
+ 'input': 'Give three tips for staying healthy.',
111
+ 'output': '1.Eat a balanced diet xxx'
112
+ }
113
+ ]
114
+
115
+ 3. Multi-turn conversation dataset.
116
+ example['conversation'] = [
117
+ {
118
+ 'input': 'Give three tips for staying healthy.',
119
+ 'output': '1.Eat a balanced diet xxx'
120
+ },
121
+ {
122
+ 'input': 'Please expand on the second point.',
123
+ 'output': 'Here is an expanded explanation of the xxx'
124
+ }
125
+ ]
126
+ """
127
+ bos_token_id, eos_token_id = get_bos_eos_token_ids(tokenizer)
128
+ assert not input_ids_with_output
129
+ input_id_list = []
130
+ for conv in example['conversation']:
131
+ input_ids = []
132
+ next_needs_bos_token = True
133
+ for single_turn_conversation in conv:
134
+ input = single_turn_conversation['input']
135
+ input_encode = tokenizer.encode(input, add_special_tokens=False)
136
+ if next_needs_bos_token:
137
+ input_ids += bos_token_id
138
+ input_ids += input_encode
139
+
140
+ if len(input_ids) > max_length:
141
+ input_ids = input_ids[:max_length]
142
+
143
+ input_id_list.append(input_ids)
144
+ return {'input_ids': input_id_list}
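
video_lisa_encode_fn turns a 'conversation' list into input_ids/labels with the prompt tokens masked out, so the loss only covers the answers. A small illustrative call, assuming any HuggingFace tokenizer works here (the checkpoint name below is a placeholder):

    # Illustration only; the tokenizer checkpoint is a placeholder.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
    example = {'conversation': [{'input': 'Please segment the person in this image.',
                                 'output': 'Sure, [SEG].'}]}
    out = video_lisa_encode_fn(example, tokenizer=tok, max_length=2048)
    assert len(out['input_ids']) == len(out['labels'])
    # prompt positions in out['labels'] equal IGNORE_INDEX (-100); only the answer
    # tokens and the trailing EOS token contribute to the training loss.
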
projects/llava_sam2/datasets/gcg_process.py ADDED
@@ -0,0 +1,297 @@
1
+ import numpy as np
2
+ import random
3
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN
4
+
5
+ GCG_QUESTIONS = [
6
+ DEFAULT_IMAGE_TOKEN + 'Could you please give me a brief description of the image? Please respond with interleaved segmentation masks for the corresponding parts of the answer.',
7
+ DEFAULT_IMAGE_TOKEN + 'Can you provide a brief description of this image? Please output with interleaved segmentation masks for the corresponding phrases.',
8
+ DEFAULT_IMAGE_TOKEN + 'Please briefly describe the contents of the image. Please respond with interleaved segmentation masks for the corresponding parts of the answer.',
9
+ DEFAULT_IMAGE_TOKEN + 'Could you give a brief explanation of what can be found within this picture? Please output with interleaved segmentation masks for the corresponding phrases.',
10
+ DEFAULT_IMAGE_TOKEN + 'Could you give me a brief explanation of this picture? Please respond with interleaved segmentation masks for the corresponding phrases.',
11
+ DEFAULT_IMAGE_TOKEN + 'Could you provide me with a brief analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer.',
12
+ ]
13
+
14
+ def refcocog_parse_annotations(example):
15
+ # example {'id': str, 'refs': [{"sentence", 'bbox', 'segmentation'},], 'img_file_name': str, 'caption': str}
16
+ annotations = {'labels': [], 'caption': [], 'masks': [], 'tokens_positive': [],
17
+ 'file_name': example['img_file_name'], 'image': example['img_file_name']}
18
+
19
+ orig_caption = example['caption'].strip('"').strip()
20
+ annotations['caption'] = orig_caption.lower()
21
+
22
+ for detail in example['refs']:
23
+ phrase = detail['sentence']
24
+ if phrase.lower() in annotations['caption']:
25
+ annotations['labels'].append(phrase)
26
+ index = annotations['caption'].find(phrase)
27
+ end_index = index + len(phrase) if index != -1 else -1
28
+ annotations['tokens_positive'].append([index, end_index])
29
+ # still polygon or rle
30
+ annotations['masks'].append(detail["segmentation"])
31
+
32
+ # Sort tokens_positive and corresponding lists
33
+ tokens_positive = annotations['tokens_positive']
34
+ sorted_indices = sorted(range(len(tokens_positive)), key=lambda i: tokens_positive[i][0])
35
+ annotations['tokens_positive'] = [tokens_positive[i] for i in sorted_indices]
36
+ annotations['masks'] = [annotations['masks'][i] for i in sorted_indices]
37
+ annotations['labels'] = [annotations['labels'][i] for i in sorted_indices]
38
+
39
+ # Trimming overlapping intervals
40
+ for i in range(len(tokens_positive)):
41
+ for j in range(i + 1, len(tokens_positive)):
42
+ # If there is overlap
43
+ if tokens_positive[i][1] >= tokens_positive[j][0]:
44
+ # Modify the end index of phrase i to be one less than the start index of phrase j
45
+ tokens_positive[i][1] = tokens_positive[j][0] - 1
46
+ # Modify the phrases to reflect the change in indices
47
+ annotations['labels'][i] = orig_caption[tokens_positive[i][0]:tokens_positive[i][1] + 1]
48
+ break # Exit inner loop since i was modified
49
+
50
+ return annotations
51
+
52
+ def refcocog_conversation(caption, tokens_positive):
53
+ # insert <p> </p> tags and [SEG] tokens into the caption, and select a question
54
+ question = random.choice(GCG_QUESTIONS).strip()
55
+
56
+ # Prepare caption with tags
57
+ def tag_caption(caption, tokens):
58
+ for start, end in sorted(tokens, key=lambda x: x[0], reverse=True):
59
+ caption = f"{caption[:start]}<p> {caption[start:end]} </p> [SEG]{caption[end:]}"
60
+ return caption
61
+
62
+ detailed_answer = tag_caption(caption, tokens_positive)
63
+
64
+ conversations = [{'from': 'human', 'value': question}, {'from': 'gpt', 'value': detailed_answer}]
65
+ return conversations
66
+
67
+ def refcocog_preprocess(example):
68
+ data_labels = example['labels']
69
+ masks = example['masks']
70
+ caption = example['caption']
71
+ tokens_positive = example['tokens_positive']
72
+
73
+ # Function to sort elements based on the start index of each phrase
74
+ def sort_by_start_index(items, order):
75
+ return [items[i] for i in order]
76
+
77
+ # Sort phrases based on their appearance in the sentence
78
+ phrase_order = sorted(range(len(tokens_positive)), key=lambda x: tokens_positive[x][0])
79
+ masks = sort_by_start_index(masks, phrase_order)
80
+ data_labels = sort_by_start_index(data_labels, phrase_order)
81
+ tokens_positive = sort_by_start_index(tokens_positive, phrase_order)
82
+
83
+ conversations = refcocog_conversation(caption, tokens_positive)
84
+ example['conversations'] = conversations
85
+ example['labels'] = data_labels
86
+ example['masks'] = masks
87
+ example['tokens_positive'] = tokens_positive
88
+
89
+ return example
90
+
91
+ def glamm_refcocog_map_fn(example):
92
+ # example {'id': str, 'refs': [{"sentence", 'bbox', 'segmentation'},], 'img_file_name': str, 'caption': str}
93
+
94
+ example = refcocog_parse_annotations(example)
95
+ # example 'labels': [], 'caption': str, 'masks': [], 'tokens_positive': [], 'file_name': image_file
96
+
97
+ example = refcocog_preprocess(example)
98
+
99
+ # do llava preprocess
100
+ messages = example['conversations']
101
+ input = ''
102
+ conversation = []
103
+ while messages and messages[0]['from'] == 'gpt':
104
+ # Skip the first one if it is from gpt
105
+ messages = messages[1:]
106
+ for msg in messages:
107
+ if msg['from'] == 'human':
108
+ if DEFAULT_IMAGE_TOKEN in msg['value']:
109
+ msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN,
110
+ '').strip()
111
+ msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value']
112
+ msg['value'] = msg['value'].strip()
113
+ input += msg['value']
114
+
115
+ elif msg['from'] == 'gpt':
116
+ conversation.append({'input': input, 'output': msg['value']})
117
+ input = ''
118
+ else:
119
+ raise NotImplementedError
120
+ example.update({'conversation': conversation})
121
+ return example
122
+
123
+ def grandf_parse_annotations(example):
124
+ image_path = example['file_name']
125
+ annotations = {
126
+ 'labels': [], 'caption': [], 'masks': [],
127
+ 'tokens_positive': [], 'file_name': image_path,
128
+ 'image': image_path}
129
+ annotations['caption'] = example['caption'].strip('"').strip()
130
+
131
+ for word, grounding in example["groundings"].items():
132
+ if grounding is None:
133
+ continue
134
+ annotations['labels'].append(word)
135
+ annotations['tokens_positive'].append(grounding["token_positives"])
136
+ annotations['masks'].append(grounding["rle_masks"])
137
+
138
+ return annotations
139
+
140
+ def grandf_conversation(caption, tokens_positive):
141
+ question = random.choice(GCG_QUESTIONS).strip()
142
+
143
+ # Prepare caption with tags
144
+ def tag_caption(caption, tokens):
145
+ for start, end in sorted(tokens, key=lambda x: x[0], reverse=True):
146
+ caption = f"{caption[:start]}<p> {caption[start:end]} </p> [SEG]{caption[end:]}"
147
+ return caption
148
+
149
+ detailed_answer = tag_caption(caption, tokens_positive)
150
+
151
+ conversations = [{'from': 'human', 'value': question}, {'from': 'gpt', 'value': detailed_answer}]
152
+ return conversations
153
+ def grandf_preprocess(example):
154
+ data_labels = example['labels']
155
+ masks = example['masks']
156
+ caption = example['caption']
157
+ tokens_positive = example['tokens_positive']
158
+
159
+ # Function to sort elements based on the start index of each phrase
160
+ def sort_by_start_index(items, order):
161
+ return [items[i] for i in order]
162
+
163
+ # Sort phrases based on their appearance in the sentence
164
+ phrase_order = sorted(range(len(tokens_positive)), key=lambda x: tokens_positive[x][0])
165
+ masks = sort_by_start_index(masks, phrase_order)
166
+ data_labels = sort_by_start_index(data_labels, phrase_order)
167
+ tokens_positive = sort_by_start_index(tokens_positive, phrase_order)
168
+
169
+ conversations = grandf_conversation(caption, tokens_positive)
170
+ example['conversations'] = conversations
171
+ example['labels'] = data_labels
172
+ example['masks'] = masks
173
+ example['tokens_positive'] = tokens_positive
174
+ return example
175
+
176
+ def glamm_granf_map_fn(example):
177
+ # example {'file_name': str, "height": int, "width": int, "image_id": str, caption: "str",
178
+ # "groundings": {ground_words: {'token_positives', 'rle_masks', }}}
179
+ example = grandf_parse_annotations(example)
180
+ # example 'labels': [], 'caption': str, 'masks': [], 'tokens_positive': [], 'file_name': image_file
181
+
182
+ example = grandf_preprocess(example)
183
+
184
+ # do llava preprocess
185
+ messages = example['conversations']
186
+ input = ''
187
+ conversation = []
188
+ while messages and messages[0]['from'] == 'gpt':
189
+ # Skip the first one if it is from gpt
190
+ messages = messages[1:]
191
+ for msg in messages:
192
+ if msg['from'] == 'human':
193
+ if DEFAULT_IMAGE_TOKEN in msg['value']:
194
+ msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN,
195
+ '').strip()
196
+ msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value']
197
+ msg['value'] = msg['value'].strip()
198
+ input += msg['value']
199
+
200
+ elif msg['from'] == 'gpt':
201
+ conversation.append({'input': input, 'output': msg['value']})
202
+ input = ''
203
+ else:
204
+ raise NotImplementedError
205
+ example.update({'conversation': conversation})
206
+ return example
207
+
208
+ glamm_openpsg_map_fn = glamm_granf_map_fn
209
+
210
+ def flickr_parse_annotations(example):
211
+ annotations = {'bboxes': [], 'labels': [], 'bboxes_ignore': [], 'caption': example['caption'], 'masks': [],
212
+ 'tokens_positive': [], 'image': example['file_name']}
213
+ ann_info = example["ann_info"]
214
+ for ann in ann_info:
215
+ if ann.get('ignore', False):
216
+ continue
217
+ x1, y1, w, h = ann['bbox']
218
+ inter_w = max(0, min(x1 + w, example['width']) - max(x1, 0))
219
+ inter_h = max(0, min(y1 + h, example['height']) - max(y1, 0))
220
+ if inter_w * inter_h == 0 or ann['area'] <= 0 or w < 1 or h < 1:
221
+ continue
222
+ bbox = [x1, y1, x1 + w, y1 + h]
223
+ annotations['bboxes'].append(bbox)
224
+ tokens_positive = ann['tokens_positive']
225
+ gt_label = [example['caption'][span[0]:span[1]] for span in tokens_positive]
226
+ annotations['labels'].append(gt_label[0])
227
+ annotations['tokens_positive'].append(tokens_positive[0])
228
+
229
+ rle = ann['sam_mask']
230
+ annotations['masks'].append(rle)
231
+
232
+ # Convert bounding boxes to numpy arrays
233
+ annotations['bboxes'] = np.array(annotations['bboxes'], dtype=np.float32) if annotations[
234
+ 'bboxes'] else np.zeros((0, 4), dtype=np.float32)
235
+ annotations['bboxes_ignore'] = np.array(annotations['bboxes_ignore'], dtype=np.float32) if annotations[
236
+ 'bboxes_ignore'] else np.zeros((0, 4), dtype=np.float32)
237
+ return annotations
238
+
239
+ def flickr_preprocess(example):
240
+ data_labels = example['labels']
241
+ masks = example['masks']
242
+ caption = example['caption']
243
+ tokens_positive = example['tokens_positive']
244
+
245
+ # Function to sort elements based on the start index of each phrase
246
+ def sort_by_start_index(items, order):
247
+ return [items[i] for i in order]
248
+
249
+ # Sort phrases based on their appearance in the sentence
250
+ phrase_order = sorted(range(len(tokens_positive)), key=lambda x: tokens_positive[x][0])
251
+ masks = sort_by_start_index(masks, phrase_order)
252
+ data_labels = sort_by_start_index(data_labels, phrase_order)
253
+ tokens_positive = sort_by_start_index(tokens_positive, phrase_order)
254
+
255
+ conversations = grandf_conversation(caption, tokens_positive)
256
+ example['conversations'] = conversations
257
+ example['labels'] = data_labels
258
+ example['masks'] = masks
259
+ example['tokens_positive'] = tokens_positive
260
+ return example
261
+
262
+ def glamm_flickr_map_fn(example):
263
+ # example {'file_name': str, "height": int, "width": int, "image_id": str, 'caption': str,
264
+ # "ann_info": [{'bbox', 'area', 'tokens_positive', 'sam_mask', ...}, ...]}
265
+
266
+ example = flickr_parse_annotations(example)
267
+
268
+ example = flickr_preprocess(example)
269
+
270
+ # do llava preprocess
271
+ messages = example['conversations']
272
+ input = ''
273
+ conversation = []
274
+ while messages and messages[0]['from'] == 'gpt':
275
+ # Skip the first one if it is from gpt
276
+ messages = messages[1:]
277
+ for msg in messages:
278
+ if msg['from'] == 'human':
279
+ if DEFAULT_IMAGE_TOKEN in msg['value']:
280
+ msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN,
281
+ '').strip()
282
+ msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value']
283
+ msg['value'] = msg['value'].strip()
284
+ input += msg['value']
285
+
286
+ elif msg['from'] == 'gpt':
287
+ conversation.append({'input': input, 'output': msg['value']})
288
+ input = ''
289
+ else:
290
+ raise NotImplementedError
291
+ example.update({'conversation': conversation})
292
+ return example
293
+
294
+
295
+
296
+
297
+
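To make the tagging step used by the conversation builders above concrete, here is a small hedged example (the caption and character spans are invented) showing how the tag_caption helper wraps each grounded phrase in <p> ... </p> and appends [SEG], processing spans right-to-left so earlier indices remain valid:

def tag_caption(caption, tokens):
    # same logic as the helper above: insert tags starting from the rightmost span
    for start, end in sorted(tokens, key=lambda x: x[0], reverse=True):
        caption = f"{caption[:start]}<p> {caption[start:end]} </p> [SEG]{caption[end:]}"
    return caption

caption = "a dog chasing a red ball"
tokens_positive = [[2, 5], [16, 24]]  # made-up spans for "dog" and "red ball"
print(tag_caption(caption, tokens_positive))
# -> a <p> dog </p> [SEG] chasing a <p> red ball </p> [SEG]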
projects/llava_sam2/datasets/grand_process.py ADDED
@@ -0,0 +1,110 @@
1
+ import numpy as np
2
+ import random
3
+ from xtuner.utils import DEFAULT_IMAGE_TOKEN
4
+
5
+ GCG_QUESTIONS = [
6
+ DEFAULT_IMAGE_TOKEN + 'Could you please give me a brief description of the image? Please respond with interleaved segmentation masks for the corresponding parts of the answer.',
7
+ DEFAULT_IMAGE_TOKEN + 'Can you provide a brief description of this image? Please output with interleaved segmentation masks for the corresponding phrases.',
8
+ DEFAULT_IMAGE_TOKEN + 'Please briefly describe the contents of the image. Please respond with interleaved segmentation masks for the corresponding parts of the answer.',
9
+ DEFAULT_IMAGE_TOKEN + 'Could you give a brief explanation of what can be found within this picture? Please output with interleaved segmentation masks for the corresponding phrases.',
10
+ DEFAULT_IMAGE_TOKEN + 'Could you give me a brief explanation of this picture? Please respond with interleaved segmentation masks for the corresponding phrases.',
11
+ DEFAULT_IMAGE_TOKEN + 'Could you provide me with a brief analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer.',
12
+ ]
13
+
14
+ def grand_parse_annotations(example):
15
+ annotations = {
16
+ 'caption': [], 'masks': [],
17
+ 'tokens_positive': [], 'labels': []}
18
+ annotations['caption'] = example['dense_caption']['caption'].strip('"').strip()
19
+ object_infos = example['dense_caption']['details']
20
+
21
+ all_seg_objects_dict = {}
22
+ for seg_object_dict in example["objects"]:
23
+ all_seg_objects_dict[seg_object_dict['id']] = seg_object_dict
24
+ for seg_object_dict in example["floating_objects"]:
25
+ all_seg_objects_dict[seg_object_dict['id']] = seg_object_dict
26
+
27
+ for object_info in object_infos:
28
+ ids = object_info["ids"]
29
+ if object_info["tokens_positive"] is None:
30
+ continue
31
+ annotations['labels'].append(object_info["phrase"])
32
+ annotations['tokens_positive'].append(object_info["tokens_positive"])
33
+ _masks = []
34
+ for _id in ids:
35
+ _masks.append(all_seg_objects_dict[_id]['segmentation'])
36
+ annotations['masks'].append(_masks)
37
+ return annotations
38
+
39
+ def grand_conversation(caption, tokens_positive):
40
+ question = random.choice(GCG_QUESTIONS).strip()
41
+
42
+ # Prepare caption with tags
43
+ def tag_caption(caption, tokens):
44
+ for start, end in sorted(tokens, key=lambda x: x[0], reverse=True):
45
+ caption = f"{caption[:start]}<p> {caption[start:end]} </p> [SEG]{caption[end:]}"
46
+ return caption
47
+
48
+ detailed_answer = tag_caption(caption, tokens_positive)
49
+
50
+ conversations = [{'from': 'human', 'value': question}, {'from': 'gpt', 'value': detailed_answer}]
51
+ return conversations
52
+
53
+ def grand_preprocess(example):
54
+ data_labels = example['labels']
55
+ masks = example['masks']
56
+ caption = example['caption']
57
+ tokens_positive = example['tokens_positive']
58
+
59
+ # Function to sort elements based on the start index of each phrase
60
+ def sort_by_start_index(items, order):
61
+ return [items[i] for i in order]
62
+
63
+ # Sort phrases based on their appearance in the sentence
64
+ phrase_order = sorted(range(len(tokens_positive)), key=lambda x: tokens_positive[x][0])
65
+ masks = sort_by_start_index(masks, phrase_order)
66
+ data_labels = sort_by_start_index(data_labels, phrase_order)
67
+ tokens_positive = sort_by_start_index(tokens_positive, phrase_order)
68
+
69
+ conversations = grand_conversation(caption, tokens_positive)
70
+ example['conversations'] = conversations
71
+ example['labels'] = data_labels
72
+ example['masks'] = masks
73
+ example['tokens_positive'] = tokens_positive
74
+ return example
75
+
76
+ def glamm_grand_map_fn(example):
77
+ # example {'dense_caption': {'caption': str, 'details': [{'ids', 'phrase', 'tokens_positive'}, ...]},
78
+ # "objects": [{'id', 'segmentation'}, ...], "floating_objects": [{'id', 'segmentation'}, ...]}
79
+ example = grand_parse_annotations(example)
80
+ # example 'labels': [], 'caption': str, 'masks': [], 'tokens_positive': [], 'file_name': image_file
81
+
82
+ example = grand_preprocess(example)
83
+
84
+ # do llava preprocess
85
+ messages = example['conversations']
86
+ input = ''
87
+ conversation = []
88
+ while messages and messages[0]['from'] == 'gpt':
89
+ # Skip the first one if it is from gpt
90
+ messages = messages[1:]
91
+ for msg in messages:
92
+ if msg['from'] == 'human':
93
+ if DEFAULT_IMAGE_TOKEN in msg['value']:
94
+ msg['value'] = msg['value'].replace(DEFAULT_IMAGE_TOKEN,
95
+ '').strip()
96
+ msg['value'] = DEFAULT_IMAGE_TOKEN + '\n' + msg['value']
97
+ msg['value'] = msg['value'].strip()
98
+ input += msg['value']
99
+
100
+ elif msg['from'] == 'gpt':
101
+ conversation.append({'input': input, 'output': msg['value']})
102
+ input = ''
103
+ else:
104
+ raise NotImplementedError
105
+ example.update({'conversation': conversation})
106
+ return example
107
+
108
+
109
+
110
+
projects/llava_sam2/datasets/utils.py ADDED
@@ -0,0 +1,58 @@
1
+
2
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
3
+ image_size):
4
+ best_ratio_diff = float('inf')
5
+ best_ratio = (1, 1)
6
+ area = width * height
7
+ for ratio in target_ratios:
8
+ target_aspect_ratio = ratio[0] / ratio[1]
9
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
10
+ if ratio_diff < best_ratio_diff:
11
+ best_ratio_diff = ratio_diff
12
+ best_ratio = ratio
13
+ elif ratio_diff == best_ratio_diff:
14
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
15
+ best_ratio = ratio
16
+ return best_ratio
17
+
18
+ def dynamic_preprocess(image,
19
+ min_num=1,
20
+ max_num=6,
21
+ image_size=448,
22
+ use_thumbnail=False):
23
+ orig_width, orig_height = image.size
24
+ aspect_ratio = orig_width / orig_height
25
+
26
+ # calculate the existing image aspect ratio
27
+ target_ratios = {(i, j)
28
+ for n in range(min_num, max_num + 1)
29
+ for i in range(1, n + 1) for j in range(1, n + 1)
30
+ if i * j <= max_num and i * j >= min_num}
31
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
32
+
33
+ # find the closest aspect ratio to the target
34
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
35
+ target_ratios, orig_width,
36
+ orig_height, image_size)
37
+
38
+ # calculate the target width and height
39
+ target_width = image_size * target_aspect_ratio[0]
40
+ target_height = image_size * target_aspect_ratio[1]
41
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
42
+
43
+ # resize the image
44
+ resized_img = image.resize((target_width, target_height))
45
+ processed_images = []
46
+ for i in range(blocks):
47
+ box = ((i % (target_width // image_size)) * image_size,
48
+ (i // (target_width // image_size)) * image_size,
49
+ ((i % (target_width // image_size)) + 1) * image_size,
50
+ ((i // (target_width // image_size)) + 1) * image_size)
51
+ # split the image
52
+ split_img = resized_img.crop(box)
53
+ processed_images.append(split_img)
54
+ assert len(processed_images) == blocks
55
+ if use_thumbnail and len(processed_images) != 1:
56
+ thumbnail_img = image.resize((image_size, image_size))
57
+ processed_images.append(thumbnail_img)
58
+ return processed_images
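A short usage sketch for the tiling helper above (the image path is a placeholder): it resizes a PIL image to the closest supported aspect ratio and splits it into up to max_num crops of image_size x image_size, optionally appending a square thumbnail.

from PIL import Image
from projects.llava_sam2.datasets.utils import dynamic_preprocess

img = Image.open('example.jpg').convert('RGB')  # hypothetical input image
tiles = dynamic_preprocess(img, min_num=1, max_num=6, image_size=448, use_thumbnail=True)
# each tile is a 448x448 PIL image; a thumbnail is appended when more than one crop is produced
print(len(tiles), [t.size for t in tiles])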
projects/llava_sam2/datasets/vqa_dataset.py ADDED
@@ -0,0 +1,509 @@
1
+ import copy
2
+ import random
3
+ import glob
4
+ import json
5
+ import logging
6
+ import os
7
+ from typing import Literal
8
+
9
+ import torch
10
+
11
+ from mmengine import print_log
12
+ from mmengine.config import Config, ConfigDict
13
+ from PIL import Image
14
+ from torch.utils.data import Dataset
15
+ import numpy as np
16
+ import torch.nn.functional as F
17
+ import torchvision.transforms as T
18
+ from torchvision.transforms.functional import InterpolationMode
19
+ from pycocotools.coco import COCO
20
+ from pycocotools import mask as mask_utils
21
+
22
+ from xtuner.registry import BUILDER
23
+ from xtuner.utils import IGNORE_INDEX
24
+ from xtuner.dataset.utils import encode_fn
25
+ from xtuner.dataset.map_fns import llava_map_fn
26
+
27
+ from projects.glamm.datasets.utils.utils import expand2square
28
+
29
+ from projects.glamm.datasets.utils.utils import SEG_QUESTIONS, ANSWER_LIST
30
+ from projects.glamm.utils import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
31
+
32
+ from .utils import dynamic_preprocess
33
+
34
+
35
+ class InfinityMMDataset(Dataset):
36
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
37
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
38
+ IMG_START_TOKEN = '<img>'
39
+ IMG_END_TOKEN = '</img>'
40
+
41
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
42
+ IMAGENET_STD = (0.229, 0.224, 0.225)
43
+
44
+ def __init__(self,
45
+ tokenizer,
46
+ data_path,
47
+ prompt_template,
48
+ special_tokens=None,
49
+ max_length=8192,
50
+ offline_save_path='./work_dirs/infinityMM.json',
51
+ ):
52
+ self.offline_save_path = offline_save_path
53
+ self.tokenizer = BUILDER.build(tokenizer)
54
+ if special_tokens is not None:
55
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
56
+ self._system = ''
57
+
58
+ self.template = prompt_template
59
+ self.max_length = max_length
60
+
61
+ self.min_dynamic_patch = 1
62
+ self.max_dynamic_patch = 12
63
+ self.downsample_ratio = 0.5
64
+ self.image_size = 448
65
+ self.use_thumbnail = True
66
+ patch_size = 14
67
+ self.patch_token = int(
68
+ (self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
69
+
70
+ self.transformer = T.Compose([
71
+ T.Lambda(lambda img: img.convert('RGB')
72
+ if img.mode != 'RGB' else img),
73
+ T.Resize((self.image_size, self.image_size),
74
+ interpolation=InterpolationMode.BICUBIC),
75
+ T.ToTensor(),
76
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
77
+ ])
78
+
79
+ self.data = self._load_annotations(data_path)
80
+ self._max_refetch = 1000
81
+
82
+ def _load_annotations(self, data_path):
83
+ if os.path.exists(self.offline_save_path):
84
+ with open(self.offline_save_path, 'r') as f:
85
+ ret = json.load(f)
86
+ print(f"Load InfinityMM file list from {self.offline_save_path}, {len(ret)} items !!!")
87
+ return ret
88
+ sub_folders = []
89
+ for sub_folder in os.listdir(data_path):
90
+ if '.' not in sub_folder:
91
+ # a folder
92
+ if "LVIS_111k" in sub_folder:
93
+ # special case, have subsub folder
94
+ subsub_folders = os.listdir(os.path.join(data_path, sub_folder))
95
+ for subsub_folder in subsub_folders:
96
+ sub_folders.append(os.path.join(data_path, sub_folder, subsub_folder))
97
+ else:
98
+ sub_folders.append(os.path.join(data_path, sub_folder))
99
+
100
+ all_jsons = []
101
+ for sub_folder in sub_folders:
102
+ print(f"Processing {sub_folder} !!!")
103
+ _files = os.listdir(sub_folder)
104
+ _num = 0
105
+ for _file in _files:
106
+ if '.json' in _file:
107
+ _json_path = os.path.join(sub_folder, _file)
108
+ _num += 1
109
+ all_jsons.append(os.path.join(sub_folder, _file))
110
+ print(f"Finished {sub_folder} has {_num} items.")
111
+
112
+ with open(self.offline_save_path, 'w') as f:
113
+ json.dump(all_jsons, f)
114
+
115
+ return all_jsons
116
+
117
+ def __getitem__(self, index):
118
+ for _ in range(self._max_refetch + 1):
119
+ data = self.prepare_data(index)
120
+ # Broken images may cause the returned data to be None
121
+ if data is None:
122
+ index = self._rand_another()
123
+ continue
124
+ return data
125
+
126
+ def __len__(self):
127
+ return len(self.data)
128
+
129
+ @property
130
+ def modality_length(self):
131
+ self.group_length = []
132
+ for data_dict in self.data:
133
+ self.group_length.append(100)
134
+ return self.group_length
135
+
136
+ @property
137
+ def length(self):
138
+ group_length = np.array(self.group_length)
139
+ group_length = np.abs(group_length).tolist()
140
+ return group_length
141
+
142
+ def prepare_data(self, index):
143
+ data_path = self.data[index]
144
+
145
+ with open(data_path, 'r') as f:
146
+ data_dict = json.load(f)
147
+ if 'image' in data_dict.keys():
148
+ data_dict['image'] = data_path.replace('.json', '.jpg')
149
+
150
+ if data_dict is None:
151
+ return None
152
+
153
+ out_data_dict = {}
154
+
155
+ if data_dict.get('image', None) is not None:
156
+ image_file = data_dict['image']
157
+ try:
158
+ image = Image.open(image_file).convert('RGB')
159
+ except Exception as e:
160
+ print(f'Error: {e}', flush=True)
161
+ print_log(f'Error: {e}', logger='current')
162
+ return None
163
+
164
+ images = dynamic_preprocess(image, self.min_dynamic_patch,
165
+ self.max_dynamic_patch,
166
+ self.image_size, self.use_thumbnail)
167
+ pixel_values = [self.transformer(image) for image in images]
168
+ pixel_values = torch.stack(pixel_values)
169
+ out_data_dict['pixel_values'] = pixel_values
170
+
171
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
172
+ image_token_str = f'{self.IMG_START_TOKEN}' \
173
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
174
+ f'{self.IMG_END_TOKEN}'
175
+ token_dict = self.get_inputid_labels(
176
+ data_dict['conversations'], image_token_str)
177
+ out_data_dict.update(token_dict)
178
+ else:
179
+ token_dict = self.get_inputid_labels(
180
+ data_dict['conversations'], None)
181
+ out_data_dict.update(token_dict)
182
+ out_data_dict['pixel_values'] = torch.zeros(
183
+ 1, 3, self.image_size, self.image_size)
184
+ return out_data_dict
185
+
186
+ def _rand_another(self) -> int:
187
+ return np.random.randint(0, len(self.data))
188
+
189
+ def get_inputid_labels(self, conversations, image_token_str) -> dict:
190
+ input = ''
191
+ out_conversation = []
192
+ while conversations and conversations[0]['from'] == 'gpt':
193
+ # Skip the first one if it is from gpt
194
+ conversations = conversations[1:]
195
+ for i, msg in enumerate(conversations):
196
+ if msg['from'] == 'human':
197
+
198
+ # change to 1 image
199
+ if '<image>' in msg['value']:
200
+ msg['value'] = msg['value'].replace('<image>\n', '').replace('<image>', '')
201
+ if i == 0:
202
+ msg['value'] = "<image>\n" + msg['value']
203
+
204
+ if image_token_str is None and '<image>' in msg['value']:
205
+ msg['value'] = msg['value'].replace('<image>', '')
206
+ if '<image>' in msg['value']:
207
+ msg['value'] = msg['value'].replace('<image>', image_token_str).strip()
208
+ input += msg['value'].strip()
209
+ elif msg['from'] == 'gpt':
210
+ out_conversation.append({
211
+ 'input': input,
212
+ 'output': msg['value'].strip()
213
+ })
214
+ input = ''
215
+ else:
216
+ raise NotImplementedError
217
+
218
+ input_ids, labels = [], []
219
+ for i, single_turn_conversation in enumerate(out_conversation):
220
+ input = single_turn_conversation.get('input', '')
221
+ if input is None:
222
+ input = ''
223
+ input_text = self.template.INSTRUCTION.format(
224
+ input=input, round=i + 1)
225
+
226
+ if i == 0:
227
+ if self._system != '' and self._system is not None:
228
+ system = self.template.SYSTEM.format(system=self._system)
229
+ input_text = system + input_text
230
+ input_encode = self.tokenizer.encode(
231
+ input_text, add_special_tokens=True)
232
+ else:
233
+ input_encode = self.tokenizer.encode(
234
+ input_text, add_special_tokens=False)
235
+ input_ids += input_encode
236
+ labels += [IGNORE_INDEX] * len(input_encode)
237
+
238
+ output_text = single_turn_conversation.get('output', '')
239
+ if self.template.get('SUFFIX', None):
240
+ output_text += self.template.SUFFIX
241
+ output_encode = self.tokenizer.encode(
242
+ output_text, add_special_tokens=False)
243
+ input_ids += output_encode
244
+ labels += copy.deepcopy(output_encode)
245
+
246
+ if len(input_ids) > self.max_length:
247
+ input_ids = input_ids[:self.max_length]
248
+ labels = labels[:self.max_length]
249
+ print_log(
250
+ f'Warning: input_ids length({len(input_ids)}) '
251
+ f'is longer than max_length, cut to {self.max_length}',
252
+ logger='current')
253
+ return {'input_ids': input_ids, 'labels': labels}
254
+
255
+
256
+ class LLaVADataset(Dataset):
257
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
258
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
259
+ IMG_START_TOKEN = '<img>'
260
+ IMG_END_TOKEN = '</img>'
261
+
262
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
263
+ IMAGENET_STD = (0.229, 0.224, 0.225)
264
+
265
+ def __init__(self,
266
+ tokenizer,
267
+ data_path,
268
+ prompt_template,
269
+ special_tokens=None,
270
+ image_folder=None,
271
+ max_length=8192,
272
+ arch_type: Literal['intern_vl', 'qwen'] = 'intern_vl',
273
+ preprocessor=None,
274
+ skip_pure_text=False,
275
+ ):
276
+
277
+ self.tokenizer = BUILDER.build(tokenizer)
278
+ if special_tokens is not None:
279
+ self.tokenizer.add_tokens(special_tokens, special_tokens=True)
280
+
281
+ self.image_folder = image_folder
282
+ self.template = prompt_template
283
+ self.max_length = max_length
284
+
285
+ self._system = ''
286
+
287
+ self.arch_type = arch_type
288
+ self.min_dynamic_patch = 1
289
+ self.max_dynamic_patch = 12
290
+ self.downsample_ratio = 0.5
291
+ if self.arch_type == 'llava':
292
+ self.downsample_ratio = 1
293
+ self.image_size = 448
294
+ if self.arch_type == 'llava':
295
+ self.image_size = 336
296
+ self.use_thumbnail = True
297
+ patch_size = 14
298
+ self.patch_token = int(
299
+ (self.image_size // patch_size)**2 * (self.downsample_ratio**2))
300
+
301
+
302
+ if self.arch_type == 'qwen':
303
+ self.IMG_CONTEXT_TOKEN = '<|image_pad|>'
304
+ self.IMG_START_TOKEN = '<|vision_start|>'
305
+ self.IMG_END_TOKEN = '<|vision_end|>'
306
+ elif self.arch_type == 'llava':
307
+ self.IMG_CONTEXT_TOKEN = '<image>'
308
+ self.IMG_START_TOKEN = ''
309
+ self.IMG_END_TOKEN = ''
310
+
311
+ if preprocessor is None:
312
+ self.transformer = T.Compose([
313
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
314
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
315
+ T.ToTensor(),
316
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
317
+ ])
318
+ self.preprocessor = None
319
+ else:
320
+ self.transformer = None
321
+ self.preprocessor = BUILDER.build(preprocessor)
322
+
323
+ self.data = self._load_annotations(data_path, image_folder)
324
+ self._max_refetch = 1000
325
+
326
+ self.skip_pure_text = skip_pure_text
327
+
328
+ def _load_annotations(self, data_path, image_folder=None):
329
+ data = json.load(open(data_path))
330
+ return data
331
+
332
+ def __getitem__(self, index):
333
+ for _ in range(self._max_refetch + 1):
334
+ data = self.prepare_data(index)
335
+ # Broken images may cause the returned data to be None
336
+ if data is None:
337
+ index = self._rand_another()
338
+ continue
339
+ return data
340
+
341
+ def __len__(self):
342
+ return len(self.data)
343
+
344
+ @property
345
+ def modality_length(self):
346
+ self.group_length = []
347
+ for data_dict in self.data:
348
+ self.group_length.append(100)
349
+ return self.group_length
350
+
351
+ @property
352
+ def length(self):
353
+ group_length = np.array(self.group_length)
354
+ group_length = np.abs(group_length).tolist()
355
+ return group_length
356
+
357
+ def prepare_data(self, index):
358
+ data_dict: dict = self.data[index]
359
+
360
+ if data_dict is None:
361
+ return None
362
+
363
+ out_data_dict = {}
364
+
365
+ if self.skip_pure_text and data_dict.get('image', None) is None:
366
+ return None
367
+
368
+ if data_dict.get('image', None) is not None:
369
+ image_file = os.path.join(self.image_folder, data_dict['image'])
370
+ try:
371
+ image = Image.open(image_file).convert('RGB')
372
+ except Exception as e:
373
+ print(f'Error: {e}', flush=True)
374
+ print_log(f'Error: {e}', logger='current')
375
+ return None
376
+ if self.preprocessor is not None:
377
+ # images = dynamic_preprocess(image, self.min_dynamic_patch,
378
+ # self.max_dynamic_patch,
379
+ # self.image_size, self.use_thumbnail)
380
+ images = [image]
381
+ if self.arch_type == 'qwen':
382
+ _data_dict = self.preprocessor(images, do_resize=True)
383
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
384
+ _data_dict['image_grid_thw'] = torch.tensor(_data_dict['image_grid_thw'], dtype=torch.int)
385
+ num_image_tokens = int(_data_dict['image_grid_thw'][0].prod() * (self.downsample_ratio ** 2))
386
+ elif self.arch_type == 'llava':
387
+ _data_dict = self.preprocessor(images, do_resize=True, size=(self.image_size, self.image_size))
388
+ _data_dict['pixel_values'] = np.stack(_data_dict['pixel_values'], axis=0)
389
+ _data_dict['pixel_values'] = torch.tensor(_data_dict['pixel_values'], dtype=torch.float)
390
+ num_image_tokens = _data_dict['pixel_values'].shape[0] * self.patch_token
391
+ else:
392
+ raise NotImplementedError
393
+ out_data_dict.update(_data_dict)
394
+ else:
395
+ images = dynamic_preprocess(image, self.min_dynamic_patch,
396
+ self.max_dynamic_patch,
397
+ self.image_size, self.use_thumbnail)
398
+ pixel_values = [self.transformer(image) for image in images]
399
+ pixel_values = torch.stack(pixel_values)
400
+ out_data_dict['pixel_values'] = pixel_values
401
+
402
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
403
+ image_token_str = f'{self.IMG_START_TOKEN}' \
404
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
405
+ f'{self.IMG_END_TOKEN}'
406
+ token_dict = self.get_inputid_labels(
407
+ data_dict['conversations'], image_token_str)
408
+ out_data_dict.update(token_dict)
409
+ else:
410
+ token_dict = self.get_inputid_labels(
411
+ data_dict['conversations'], None)
412
+ out_data_dict.update(token_dict)
413
+ out_data_dict['pixel_values'] = torch.zeros(
414
+ 1, 3, self.image_size, self.image_size)
415
+ return out_data_dict
416
+
417
+ def _rand_another(self) -> int:
418
+ return np.random.randint(0, len(self.data))
419
+
420
+ def get_inputid_labels(self, conversations, image_token_str) -> dict:
421
+ input = ''
422
+ out_conversation = []
423
+ while conversations and conversations[0]['from'] == 'gpt':
424
+ # Skip the first one if it is from gpt
425
+ conversations = conversations[1:]
426
+ for msg in conversations:
427
+ if msg['from'] == 'human':
428
+ if image_token_str is None and '<image>' in msg['value']:
429
+ msg['value'] = msg['value'].replace('<image>', '')
430
+ if '<image>' in msg['value']:
431
+ msg['value'] = msg['value'].replace('<image>', image_token_str).strip()
432
+ input += msg['value'].strip()
433
+ elif msg['from'] == 'gpt':
434
+ out_conversation.append({
435
+ 'input': input,
436
+ 'output': msg['value'].strip()
437
+ })
438
+ input = ''
439
+ else:
440
+ raise NotImplementedError
441
+
442
+ input_ids, labels = [], []
443
+ for i, single_turn_conversation in enumerate(out_conversation):
444
+ input = single_turn_conversation.get('input', '')
445
+ if input is None:
446
+ input = ''
447
+ input_text = self.template.INSTRUCTION.format(
448
+ input=input, round=i + 1)
449
+
450
+ if i == 0:
451
+ if self._system != '' and self._system is not None:
452
+ system = self.template.SYSTEM.format(system=self._system)
453
+ input_text = system + input_text
454
+ input_encode = self.tokenizer.encode(
455
+ input_text, add_special_tokens=True)
456
+ else:
457
+ input_encode = self.tokenizer.encode(
458
+ input_text, add_special_tokens=False)
459
+ input_ids += input_encode
460
+ labels += [IGNORE_INDEX] * len(input_encode)
461
+
462
+ output_text = single_turn_conversation.get('output', '')
463
+ if self.template.get('SUFFIX', None):
464
+ output_text += self.template.SUFFIX
465
+ output_encode = self.tokenizer.encode(
466
+ output_text, add_special_tokens=False)
467
+ input_ids += output_encode
468
+ labels += copy.deepcopy(output_encode)
469
+
470
+ if len(input_ids) > self.max_length:
471
+ input_ids = input_ids[:self.max_length]
472
+ labels = labels[:self.max_length]
473
+ print_log(
474
+ f'Warning: input_ids length({len(input_ids)}) '
475
+ f'is longer than max_length, cut to {self.max_length}',
476
+ logger='current')
477
+ return {'input_ids': input_ids, 'labels': labels}
478
+
479
+
480
+ if __name__ == '__main__':
481
+ from transformers import CLIPImageProcessor, AutoTokenizer
482
+ from third_parts.segment_anything.utils.transforms import ResizeLongestSide
483
+ pretrained_model = 'MBZUAI/GLaMM-GranD-Pretrained'
484
+ llm_name_or_path = 'lmsys/vicuna-7b-v1.5'
485
+
486
+ tokenizer = dict(
487
+ type=AutoTokenizer.from_pretrained,
488
+ pretrained_model_name_or_path=llm_name_or_path)
489
+ image_processor = dict(
490
+ type=CLIPImageProcessor.from_pretrained,
491
+ pretrained_model_name_or_path='openai/clip-vit-large-patch14-336')
492
+ extra_image_processor = dict(
493
+ type=ResizeLongestSide,
494
+ target_length=1024,
495
+ )
496
+ from xtuner.utils.templates import PROMPT_TEMPLATE
497
+ prompt_template = PROMPT_TEMPLATE.vicuna
498
+ from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory, template_map_fn
499
+ from projects.glamm.datasets.collate_fns.glamm_collate_fn import glamm_collate_fn
500
+
501
+ dataset = LLaVADataset(
502
+ tokenizer=tokenizer,
503
+ data_path='data/llava_data/LLaVA-Instruct-150K/llava_instruct_150k.json',
504
+ prompt_template=prompt_template,
505
+ special_tokens=['[SEG]'],
506
+ image_folder='data/coco/train2017/',
507
+ )
508
+ for i in range(1000):
509
+ dataset[i]
projects/llava_sam2/deepspeed_zero2_sam2.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "gradient_accumulation_steps": "auto",
3
+ "train_micro_batch_size_per_gpu": "auto",
4
+ "gradient_clipping": "auto",
5
+ "zero_allow_untested_optimizer": true,
6
+ "zero_force_ds_cpu_optimizer": false,
7
+ "zero_optimization": {
8
+ "stage": 2,
9
+ "overlap_comm": true,
10
+ "allgather_bucket_size": 5368709120,
11
+ "reduce_bucket_size": 5368709120,
12
+ "reduce_scatter": true,
13
+ "sub_group_size": 1e9,
14
+ "contiguous_gradients": true,
15
+ "allgather_partitions": true
16
+ },
17
+ "fp16": {
18
+ "enabled": false,
19
+ "initial_scale_power": 16
20
+ },
21
+ "bf16": {
22
+ "enabled": true
23
+ }
24
+ }
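As a hedged usage note (not prescribed by this repository): the ZeRO-2 JSON above is normally consumed by the training framework, which resolves the "auto" placeholders; if you hand it to DeepSpeed yourself, those fields must first be replaced with concrete values, for example:

import json
import torch
import deepspeed  # assumes DeepSpeed is installed and the script runs under its launcher

with open('projects/llava_sam2/deepspeed_zero2_sam2.json') as f:
    ds_config = json.load(f)
ds_config.update({
    'train_micro_batch_size_per_gpu': 2,   # illustrative values replacing the "auto" entries
    'gradient_accumulation_steps': 4,
    'gradient_clipping': 1.0,
})

model = torch.nn.Linear(16, 16)            # toy model for the sketch
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
engine, optimizer, _, _ = deepspeed.initialize(
    model=model, optimizer=optimizer, config=ds_config)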
projects/llava_sam2/gradio/app.py ADDED
@@ -0,0 +1,151 @@
1
+ import gradio as gr
2
+ import sys
3
+ from projects.llava_sam2.gradio.app_utils import\
4
+ process_markdown, show_mask_pred, description, preprocess_video,\
5
+ show_mask_pred_video, image2video_and_save
6
+
7
+ import torch
8
+ from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
9
+ BitsAndBytesConfig, CLIPImageProcessor,
10
+ CLIPVisionModel, GenerationConfig)
11
+ import argparse
12
+ import os
13
+
14
+ TORCH_DTYPE_MAP = dict(
15
+ fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto')
16
+
17
+ def parse_args(args):
18
+ parser = argparse.ArgumentParser(description="Sa2VA Demo")
19
+ parser.add_argument('hf_path', help='Sa2VA hf path.')
20
+ return parser.parse_args(args)
21
+
22
+ def inference(image, video, follow_up, input_str):
23
+ input_image = image
24
+ if image is not None and (video is not None and os.path.exists(video)):
25
+ return image, video, "Error: Please only input a image or a video !!!"
26
+ if image is None and (video is None or not os.path.exists(video)) and not follow_up:
27
+ return image, video, "Error: Please input a image or a video !!!"
28
+
29
+ if not follow_up:
30
+ # reset
31
+ print('Log: History responses have been removed!')
32
+ global_infos.n_turn = 0
33
+ global_infos.inputs = ''
34
+ text = input_str
35
+
36
+ image = input_image
37
+ global_infos.image_for_show = image
38
+ global_infos.image = image
39
+ video = video
40
+ global_infos.video = video
41
+
42
+ if image is not None:
43
+ global_infos.input_type = "image"
44
+ else:
45
+ global_infos.input_type = "video"
46
+
47
+ else:
48
+ text = input_str
49
+ image = global_infos.image
50
+ video = global_infos.video
51
+
52
+ input_type = global_infos.input_type
53
+ if input_type == "video":
54
+ video = preprocess_video(video, global_infos.inputs+input_str)
55
+
56
+ past_text = global_infos.inputs
57
+
58
+ if past_text == "" and "<image>" not in text:
59
+ text = "<image>" + text
60
+ if input_type == "image":
61
+ input_dict = {
62
+ 'image': image,
63
+ 'text': text,
64
+ 'past_text': past_text,
65
+ 'mask_prompts': None,
66
+ 'tokenizer': tokenizer,
67
+ }
68
+ else:
69
+ input_dict = {
70
+ 'video': video,
71
+ 'text': text,
72
+ 'past_text': past_text,
73
+ 'mask_prompts': None,
74
+ 'tokenizer': tokenizer,
75
+ }
76
+
77
+ return_dict = sa2va_model.predict_forward(**input_dict)
78
+ global_infos.inputs = return_dict["past_text"]
79
+ print(return_dict['past_text'])
80
+ if 'prediction_masks' in return_dict.keys() and return_dict['prediction_masks'] and len(
81
+ return_dict['prediction_masks']) != 0:
82
+ if input_type == "image":
83
+ image_mask_show, selected_colors = show_mask_pred(global_infos.image_for_show, return_dict['prediction_masks'],)
84
+ video_mask_show = global_infos.video
85
+ else:
86
+ image_mask_show = None
87
+ video_mask_show, selected_colors = show_mask_pred_video(video, return_dict['prediction_masks'],)
88
+ video_mask_show = image2video_and_save(video_mask_show, save_path="./ret_video.mp4")
89
+ else:
90
+ image_mask_show = global_infos.image_for_show
91
+ video_mask_show = global_infos.video
92
+ selected_colors = []
93
+
94
+ predict = return_dict['prediction'].strip()
95
+ global_infos.n_turn += 1
96
+
97
+ predict = process_markdown(predict, selected_colors)
98
+ return image_mask_show, video_mask_show, predict
99
+
100
+ def init_models(args):
101
+ model_path = args.hf_path
102
+ model = AutoModel.from_pretrained(
103
+ model_path,
104
+ torch_dtype=torch.bfloat16,
105
+ low_cpu_mem_usage=True,
106
+ use_flash_attn=True,
107
+ trust_remote_code=True,
108
+ ).eval().cuda()
109
+
110
+ tokenizer = AutoTokenizer.from_pretrained(
111
+ model_path,
112
+ trust_remote_code=True,
113
+ )
114
+ return model, tokenizer
115
+
116
+ class global_infos:
117
+ inputs = ''
118
+ n_turn = 0
119
+ image_width = 0
120
+ image_height = 0
121
+
122
+ image_for_show = None
123
+ image = None
124
+ video = None
125
+
126
+ input_type = "image" # "image" or "video"
127
+
128
+ if __name__ == "__main__":
129
+ # get parse args and set models
130
+ args = parse_args(sys.argv[1:])
131
+
132
+ sa2va_model, tokenizer = \
133
+ init_models(args)
134
+
135
+ demo = gr.Interface(
136
+ inference,
137
+ inputs=[
138
+ gr.Image(type="pil", label="Upload Image", height=360),
139
+ gr.Video(sources=["upload", "webcam"], label="Upload mp4 video", height=360),
140
+ gr.Checkbox(label="Follow up Question"),
141
+ gr.Textbox(lines=1, placeholder=None, label="Text Instruction"),],
142
+ outputs=[
143
+ gr.Image(type="pil", label="Output Image"),
144
+ gr.Video(label="Output Video", show_download_button=True, format='mp4'),
145
+ gr.Markdown()],
146
+ theme=gr.themes.Soft(), allow_flagging="auto", description=description,
147
+ title='Sa2VA'
148
+ )
149
+
150
+ demo.queue()
151
+ demo.launch(share=True)
projects/llava_sam2/gradio/app_utils.py ADDED
@@ -0,0 +1,293 @@
1
+ import numpy as np
2
+ from PIL import Image
3
+ import cv2
4
+
5
+ markdown_default = """
6
+ <link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@400;700&display=swap" rel="stylesheet">
7
+ <style>
8
+ .highlighted-text {
9
+ font-family: 'Montserrat', sans-serif;
10
+ font-weight: 600;
11
+ font-size: 14px;
12
+ color: rgb(255, 255, 239);
13
+ background-color: rgb(225, 231, 254);
14
+ border-radius: 7px;
15
+ padding: 5px 7px;
16
+ display: inline-block;
17
+ }
18
+ .regular-text {
19
+ font-family: 'Montserrat', sans-serif;
20
+ font-weight: 400;
21
+ font-size: 14px;
22
+ }
23
+ .highlighted-response {
24
+ font-family: 'Montserrat', sans-serif;
25
+ font-weight: 600;
26
+ font-size: 14px;
27
+ border-radius: 6px;
28
+ padding: 3px 4px;
29
+ display: inline-block;
30
+ }
31
+ </style>
32
+ <span class="highlighted-text" style='color:rgb(107, 100, 239)'>Sa2VA</span>
33
+ """
34
+
35
+ description = """
36
+ **Usage**: <br>
37
+ &ensp;(1) For **Grounded Caption Generation** with interleaved segmentation, input a prompt like: *"Could you provide me with a detailed analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer."* <br>
38
+ &ensp;(2) For **Segmentation Output**, input a prompt like: *"Can you please segment xxx in the given image"* <br>
39
+ &ensp;(3) For **Image Captioning** / VQA, input a prompt like: *"Could you please give me a detailed description of the image?"* <br>
40
+ &ensp;(4) For **Image Conversation**, input an arbitrary text instruction. <br>
41
+ """
42
+
43
+ ONE_THIRD = 1.0/3.0
44
+ ONE_SIXTH = 1.0/6.0
45
+ TWO_THIRD = 2.0/3.0
46
+
47
+ def desaturate(rgb, factor=0.65):
48
+ """
49
+ Desaturate an RGB color by a given factor.
50
+
51
+ :param rgb: A tuple of (r, g, b) where each value is in [0, 255].
52
+ :param factor: The target lightness in [0, 1]; hue and saturation are preserved
53
+ while the lightness is set to this value (lower values give darker, more muted colors).
54
+ :return: A tuple of desaturated (r, g, b) values in [0, 255].
55
+ """
56
+ r, g, b = [x / 255.0 for x in rgb]
57
+ h, l, s = rgb_to_hls(r, g, b)
58
+ l = factor
59
+ new_r, new_g, new_b = hls_to_rgb(h, l, s)
60
+ return (int(new_r * 255), int(new_g * 255), int(new_b * 255))
61
+
62
+ def rgb_to_hls(r, g, b):
63
+ maxc = max(r, g, b)
64
+ minc = min(r, g, b)
65
+ sumc = (maxc+minc)
66
+ rangec = (maxc-minc)
67
+ l = sumc/2.0
68
+ if minc == maxc:
69
+ return 0.0, l, 0.0
70
+ if l <= 0.5:
71
+ s = rangec / sumc
72
+ else:
73
+ s = rangec / (2.0-sumc)
74
+ rc = (maxc-r) / rangec
75
+ gc = (maxc-g) / rangec
76
+ bc = (maxc-b) / rangec
77
+ if r == maxc:
78
+ h = bc-gc
79
+ elif g == maxc:
80
+ h = 2.0+rc-bc
81
+ else:
82
+ h = 4.0+gc-rc
83
+ h = (h/6.0) % 1.0
84
+ return h, l, s
85
+
86
+ def hls_to_rgb(h, l, s):
87
+ if s == 0.0:
88
+ return l, l, l
89
+ if l <= 0.5:
90
+ m2 = l * (1.0+s)
91
+ else:
92
+ m2 = l+s-(l*s)
93
+ m1 = 2.0*l - m2
94
+ return (_v(m1, m2, h+ONE_THIRD), _v(m1, m2, h), _v(m1, m2, h-ONE_THIRD))
95
+
96
+ def _v(m1, m2, hue):
97
+ hue = hue % 1.0
98
+ if hue < ONE_SIXTH:
99
+ return m1 + (m2-m1)*hue*6.0
100
+ if hue < 0.5:
101
+ return m2
102
+ if hue < TWO_THIRD:
103
+ return m1 + (m2-m1)*(TWO_THIRD-hue)*6.0
104
+ return m1
105
+
106
+ def process_markdown(output_str, colors):
107
+ output_str = output_str.replace("\n", "").replace(" ", " ").replace("<s>", "")\
108
+ .replace("<|im_end|>", '').replace("<|end|>", "")
109
+ output_str = output_str.split("ASSISTANT: ")[-1]
110
+
111
+ # markdown_out = output_str.replace('[SEG]', '')
112
+ markdown_out = output_str
113
+ markdown_out = markdown_out.replace(
114
+ "<p>", "<span class='highlighted-response' style='background-color:rgb[COLOR]'>"
115
+ )
116
+ markdown_out = markdown_out.replace("</p>", "</span>")
117
+
118
+ for color in colors:
119
+ markdown_out = markdown_out.replace("[COLOR]", str(desaturate(tuple(color))), 1)
120
+
121
+ markdown_out = f"""
122
+ {markdown_out}
123
+ """
124
+ markdown_out = markdown_default + "<p><span class='regular-text'>" + markdown_out
125
+ return markdown_out
126
+
127
+ def show_mask_pred(image, masks):
128
+ masks = [mask[:1] for mask in masks]
129
+ masks = np.concatenate(masks, axis=0) # (n, h, w)
130
+
131
+ selected_colors = []
132
+
133
+ colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
134
+ (255, 255, 0), (255, 0, 255), (0, 255, 255),
135
+ (128, 128, 255), [255, 192, 203], # Pink
136
+ [165, 42, 42], # Brown
137
+ [255, 165, 0], # Orange
138
+ [128, 0, 128], # Purple
139
+ [0, 0, 128], # Navy
140
+ [128, 0, 0], # Maroon
141
+ [128, 128, 0], # Olive
142
+ [70, 130, 180], # Steel Blue
143
+ [173, 216, 230], # Light Blue
144
+ [255, 192, 0], # Gold
145
+ [255, 165, 165], # Light Salmon
146
+ [255, 20, 147], # Deep Pink
147
+ ]
148
+
149
+ _mask_image = np.zeros((masks.shape[1], masks.shape[2], 3), dtype=np.uint8)
150
+
151
+ for i, mask in enumerate(masks):
152
+ color = colors[i % len(colors)]
153
+ selected_colors.append(color)
154
+ _mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
155
+ _mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
156
+ _mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]
157
+
158
+
159
+ image = np.array(image)
160
+ image = image * 0.5 + _mask_image * 0.5
161
+ image = image.astype(np.uint8)
162
+ return image, selected_colors
163
+
164
+ def show_mask_pred_video(video, masks):
165
+ ret_video = []
166
+ selected_colors = []
167
+ colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
168
+ (255, 255, 0), (255, 0, 255), (0, 255, 255),
169
+ (128, 128, 255), [255, 192, 203], # Pink
170
+ [165, 42, 42], # Brown
171
+ [255, 165, 0], # Orange
172
+ [128, 0, 128], # Purple
173
+ [0, 0, 128], # Navy
174
+ [128, 0, 0], # Maroon
175
+ [128, 128, 0], # Olive
176
+ [70, 130, 180], # Steel Blue
177
+ [173, 216, 230], # Light Blue
178
+ [255, 192, 0], # Gold
179
+ [255, 165, 165], # Light Salmon
180
+ [255, 20, 147], # Deep Pink
181
+ ]
182
+ for i_frame in range(len(video)):
183
+ frame_masks = [mask[i_frame:i_frame+1] for mask in masks]
184
+ frame_masks = np.concatenate(frame_masks, axis=0)
185
+ _mask_image = np.zeros((frame_masks.shape[1], frame_masks.shape[2], 3), dtype=np.uint8)
186
+
187
+ for i, mask in enumerate(frame_masks):
188
+ if i_frame == 0:
189
+ color = colors[i % len(colors)]
190
+ selected_colors.append(color)
191
+ else:
192
+ color = selected_colors[i]
193
+ _mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
194
+ _mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
195
+ _mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]
196
+
197
+ image = np.array(video[i_frame])
198
+ image = image * 0.5 + _mask_image * 0.5
199
+ image = image.astype(np.uint8)
200
+ ret_video.append(image)
201
+ return ret_video, selected_colors
202
+
203
+ def parse_visual_prompts(points):
204
+ ret = {'points': [], 'boxes': []}
205
+ for item in points:
206
+ if item[2] == 1.0:
207
+ ret['points'].append([item[0], item[1]])
208
+ elif item[2] == 2.0 or item[2] == 3.0:
209
+ ret['boxes'].append([item[0], item[1], item[3], item[4]])
210
+ else:
211
+ raise NotImplementedError
212
+ return ret
213
+
214
+ def get_video_frames(video_path):
215
+ cap = cv2.VideoCapture(video_path)
216
+
217
+ if not cap.isOpened():
218
+ print("Error: Cannot open video file.")
219
+ return
220
+
221
+ frames = []
222
+
223
+ frame_id = 0
224
+ while True:
225
+ ret, frame = cap.read()
226
+
227
+ if not ret:
228
+ break
229
+
230
+ frames.append(frame)
231
+
232
+ frame_id += 1
233
+
234
+ cap.release()
235
+ return frames
236
+
237
+ def get_frames_from_video(video_path, n_frames=5, sample_type="uniform"):
238
+ frames = get_video_frames(video_path)
239
+ if sample_type == "uniform":
240
+ stride = len(frames) / (n_frames + 1e-4)
241
+ ret = []
242
+ for i in range(n_frames):
243
+ idx = int(i * stride)
244
+ frame = frames[idx]
245
+ frame = frame[:, :, ::-1]
246
+ frame_image = Image.fromarray(frame).convert('RGB')
247
+ ret.append(frame_image)
248
+ else:
249
+ ret = []
250
+ for frame in frames[:500]:
251
+ frame = frame[:, :, ::-1]
252
+ frame_image = Image.fromarray(frame).convert('RGB')
253
+ ret.append(frame_image)
254
+ return ret
255
+
256
+ def preprocess_video(video_path, text):
257
+ if "Segment" in text or "segment" in text:
258
+ sample_type = 'begin'
259
+ else:
260
+ sample_type = 'uniform'
261
+ return get_frames_from_video(video_path, sample_type=sample_type)
262
+
263
+ def image2video_and_save(frames, save_path):
264
+ success = frames_to_video(frames, save_path)
265
+ return save_path
266
+
267
+
268
+ def frames_to_video(
269
+ frames,
270
+ output_path: str,
271
+ fps: int = 24,
272
+ ) -> bool:
273
+ try:
274
+ frames = [frame[:, :, ::-1] for frame in frames]
275
+ # Use provided frame size or get from first frame
276
+ height, width = frames[0].shape[:2]
277
+
278
+ # Initialize video writer
279
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
280
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
281
+
282
+ # Process each frame
283
+ for frame in frames:
284
+ out.write(frame)
285
+
286
+ # Release video writer
287
+ out.release()
288
+ print(f"Video saved successfully to {output_path}")
289
+ return True
290
+
291
+ except Exception as e:
292
+ print(f"Error converting frames to video: {str(e)}")
293
+ return False
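A small hedged check of the mask-overlay helper above, using placeholder data instead of real model outputs (each per-object mask is assumed to have shape (num_frames, H, W); only the first frame is used for images):

import numpy as np
from PIL import Image
from projects.llava_sam2.gradio.app_utils import show_mask_pred

h, w = 64, 64
masks = [np.zeros((1, h, w), dtype=bool) for _ in range(2)]  # two dummy objects
masks[0][0, :32, :32] = True
masks[1][0, 32:, 32:] = True

image = Image.fromarray(np.full((h, w, 3), 200, dtype=np.uint8))
overlay, colors = show_mask_pred(image, masks)
print(overlay.shape, colors)  # (64, 64, 3) blended overlay and the colors assigned per object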
projects/llava_sam2/models/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .llava_sam2 import VideoLLaVASAMModel, VideoLLaVASAMModel_zero3
2
+ from .sam2 import SAM2
3
+ from .sam2_train import SAM2TrainRunner
projects/llava_sam2/models/extension/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .sam2_base import SAM2Base
projects/llava_sam2/models/extension/sam2_base.py ADDED
@@ -0,0 +1,281 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+ from third_parts.sam2.modeling.sam2_base import SAM2Base as _SAM2Base
5
+ from third_parts.sam2.modeling.sam2_base import NO_OBJ_SCORE
6
+
7
+
8
+ class SAM2Base(_SAM2Base):
9
+
10
+ def track_step(
11
+ self,
12
+ frame_idx,
13
+ is_init_cond_frame,
14
+ current_vision_feats,
15
+ current_vision_pos_embeds,
16
+ feat_sizes,
17
+ point_inputs,
18
+ mask_inputs,
19
+ output_dict,
20
+ num_frames,
21
+ track_in_reverse=False, # tracking in reverse time order (for demo usage)
22
+ # Whether to run the memory encoder on the predicted masks. Sometimes we might want
23
+ # to skip the memory encoder with `run_mem_encoder=False`. For example,
24
+ # in demo we might call `track_step` multiple times for each user click,
25
+ # and only encode the memory when the user finalizes their clicks. And in ablation
26
+ # settings like SAM training on static images, we don't need the memory encoder.
27
+ run_mem_encoder=True,
28
+ # The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
29
+ prev_sam_mask_logits=None,
30
+ ## Extension: LLM prompt
31
+ language_embd=None,
32
+ ):
33
+ current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
34
+ # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
35
+ if len(current_vision_feats) > 1:
36
+ high_res_features = [
37
+ x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
38
+ for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
39
+ ]
40
+ else:
41
+ high_res_features = None
42
+ if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
43
+ # When use_mask_input_as_output_without_sam=True, we directly output the mask input
44
+ # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
45
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0)
46
+ pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
47
+ sam_outputs = self._use_mask_as_output(
48
+ pix_feat, high_res_features, mask_inputs
49
+ )
50
+ else:
51
+ # fused the visual feature with previous memory features in the memory bank
52
+ pix_feat_with_mem = self._prepare_memory_conditioned_features(
53
+ frame_idx=frame_idx,
54
+ is_init_cond_frame=is_init_cond_frame,
55
+ current_vision_feats=current_vision_feats[-1:],
56
+ current_vision_pos_embeds=current_vision_pos_embeds[-1:],
57
+ feat_sizes=feat_sizes[-1:],
58
+ output_dict=output_dict,
59
+ num_frames=num_frames,
60
+ track_in_reverse=track_in_reverse,
61
+ )
62
+ # apply SAM-style segmentation head
63
+ # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder,
64
+ # e.g. in demo where such logits come from earlier interaction instead of correction sampling
65
+ # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead)
66
+ if prev_sam_mask_logits is not None:
67
+ assert point_inputs is not None and mask_inputs is None
68
+ mask_inputs = prev_sam_mask_logits
69
+ multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
70
+ sam_outputs = self._forward_sam_heads(
71
+ backbone_features=pix_feat_with_mem,
72
+ point_inputs=point_inputs,
73
+ mask_inputs=mask_inputs,
74
+ high_res_features=high_res_features,
75
+ multimask_output=multimask_output,
76
+ # Inject language Embed if possible
77
+ language_embd=language_embd,
78
+ )
79
+ (
80
+ _,
81
+ _,
82
+ _,
83
+ low_res_masks,
84
+ high_res_masks,
85
+ obj_ptr,
86
+ _,
87
+ ) = sam_outputs
88
+
89
+ current_out["pred_masks"] = low_res_masks
90
+ current_out["pred_masks_high_res"] = high_res_masks
91
+ current_out["obj_ptr"] = obj_ptr
92
+
93
+ # Finally run the memory encoder on the predicted mask to encode
94
+ # it into a new memory feature (that can be used in future frames)
95
+ if run_mem_encoder and self.num_maskmem > 0:
96
+ high_res_masks_for_mem_enc = high_res_masks
97
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
98
+ current_vision_feats=current_vision_feats,
99
+ feat_sizes=feat_sizes,
100
+ pred_masks_high_res=high_res_masks_for_mem_enc,
101
+ is_mask_from_pts=(point_inputs is not None),
102
+ )
103
+ current_out["maskmem_features"] = maskmem_features
104
+ current_out["maskmem_pos_enc"] = maskmem_pos_enc
105
+ else:
106
+ current_out["maskmem_features"] = None
107
+ current_out["maskmem_pos_enc"] = None
108
+
109
+ return current_out
110
+
111
+
112
+ def _forward_sam_heads(
113
+ self,
114
+ backbone_features,
115
+ point_inputs=None,
116
+ mask_inputs=None,
117
+ high_res_features=None,
118
+ multimask_output=False,
119
+ ## Extension: LLM prompt
120
+ language_embd=None,
121
+ ):
122
+ """
123
+ Forward SAM prompt encoders and mask heads.
124
+
125
+ Inputs:
126
+ - backbone_features: image features of [B, C, H, W] shape
127
+ - point_inputs: a dictionary with "point_coords" and "point_labels", where
128
+ 1) "point_coords" has [B, P, 2] shape and float32 dtype and contains the
129
+ absolute pixel-unit coordinate in (x, y) format of the P input points
130
+ 2) "point_labels" has shape [B, P] and int32 dtype, where 1 means
131
+ positive clicks, 0 means negative clicks, and -1 means padding
132
+ - mask_inputs: a mask of [B, 1, H*16, W*16] shape, float or bool, with the
133
+ same spatial size as the image.
134
+ - high_res_features: either 1) None or 2) or a list of length 2 containing
135
+ two feature maps of [B, C, 4*H, 4*W] and [B, C, 2*H, 2*W] shapes respectively,
136
+ which will be used as high-resolution feature maps for SAM decoder.
137
+ - multimask_output: if it's True, we output 3 candidate masks and their 3
138
+ corresponding IoU estimates, and if it's False, we output only 1 mask and
139
+ its corresponding IoU estimate.
140
+
141
+ Outputs:
142
+ - low_res_multimasks: [B, M, H*4, W*4] shape (where M = 3 if
143
+ `multimask_output=True` and M = 1 if `multimask_output=False`), the SAM
144
+ output mask logits (before sigmoid) for the low-resolution masks, with 4x
145
+ the resolution (1/4 stride) of the input backbone_features.
146
+ - high_res_multimasks: [B, M, H*16, W*16] shape (where M = 3
147
+ if `multimask_output=True` and M = 1 if `multimask_output=False`),
148
+ upsampled from the low-resolution masks, with shape size as the image
149
+ (stride is 1 pixel).
150
+ - ious, [B, M] shape, where (where M = 3 if `multimask_output=True` and M = 1
151
+ if `multimask_output=False`), the estimated IoU of each output mask.
152
+ - low_res_masks: [B, 1, H*4, W*4] shape, the best mask in `low_res_multimasks`.
153
+ If `multimask_output=True`, it's the mask with the highest IoU estimate.
154
+ If `multimask_output=False`, it's the same as `low_res_multimasks`.
155
+ - high_res_masks: [B, 1, H*16, W*16] shape, the best mask in `high_res_multimasks`.
156
+ If `multimask_output=True`, it's the mask with the highest IoU estimate.
157
+ If `multimask_output=False`, it's the same as `high_res_multimasks`.
158
+ - obj_ptr: [B, C] shape, the object pointer vector for the output mask, extracted
159
+ based on the output token from the SAM mask decoder.
160
+ """
161
+ B = backbone_features.size(0)
162
+ device = backbone_features.device
163
+ assert backbone_features.size(1) == self.sam_prompt_embed_dim
164
+ assert backbone_features.size(2) == self.sam_image_embedding_size
165
+ assert backbone_features.size(3) == self.sam_image_embedding_size
166
+
167
+ # a) Handle point prompts
168
+ if point_inputs is not None:
169
+ sam_point_coords = point_inputs["point_coords"]
170
+ sam_point_labels = point_inputs["point_labels"]
171
+ assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
172
+ else:
173
+ # If no points are provide, pad with an empty point (with label -1)
174
+ sam_point_coords = torch.zeros(B, 1, 2, device=device)
175
+ sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
176
+
177
+ # b) Handle mask prompts
178
+ if mask_inputs is not None:
179
+ # If mask_inputs is provided, downsize it into low-res mask input if needed
180
+ # and feed it as a dense mask prompt into the SAM mask encoder
181
+ assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
182
+ if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
183
+ sam_mask_prompt = F.interpolate(
184
+ mask_inputs.float(),
185
+ size=self.sam_prompt_encoder.mask_input_size,
186
+ align_corners=False,
187
+ mode="bilinear",
188
+ antialias=True, # use antialias for downsampling
189
+ )
190
+ else:
191
+ sam_mask_prompt = mask_inputs
192
+ else:
193
+ # Otherwise, simply feed None (and SAM's prompt encoder will add
194
+ # a learned `no_mask_embed` to indicate no mask input in this case).
195
+ sam_mask_prompt = None
196
+
197
+ sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
198
+ points=(sam_point_coords, sam_point_labels),
199
+ boxes=None,
200
+ masks=sam_mask_prompt,
201
+ )
202
+
203
+ ## Extension: LLM prompt
204
+ if language_embd is not None:
205
+ # B N C
206
+ assert sparse_embeddings.size(0) == language_embd.size(0)
207
+ assert sparse_embeddings.size(2) == language_embd.size(2)
208
+ sparse_embeddings = torch.cat([sparse_embeddings, language_embd], dim=1)
209
+
210
+ (
211
+ low_res_multimasks,
212
+ ious,
213
+ sam_output_tokens,
214
+ object_score_logits,
215
+ ) = self.sam_mask_decoder(
216
+ image_embeddings=backbone_features,
217
+ image_pe=self.sam_prompt_encoder.get_dense_pe(),
218
+ sparse_prompt_embeddings=sparse_embeddings,
219
+ dense_prompt_embeddings=dense_embeddings,
220
+ multimask_output=multimask_output,
221
+ repeat_image=False, # the image is already batched
222
+ high_res_features=high_res_features,
223
+ )
224
+ if self.pred_obj_scores:
225
+ is_obj_appearing = object_score_logits > 0
226
+
227
+ # Mask used for spatial memories is always a *hard* choice between obj and no obj,
228
+ # consistent with the actual mask prediction
229
+ # print('Do torch.where !!!')
230
+ # low_res_multimasks = torch.where(
231
+ # is_obj_appearing[:, None, None],
232
+ # low_res_multimasks,
233
+ # NO_OBJ_SCORE,
234
+ # )
235
+
236
+ # convert masks from possibly bfloat16 (or float16) to float32
237
+ # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
238
+ low_res_multimasks = low_res_multimasks.float()
239
+ high_res_multimasks = F.interpolate(
240
+ low_res_multimasks,
241
+ size=(self.image_size, self.image_size),
242
+ mode="bilinear",
243
+ align_corners=False,
244
+ )
245
+
246
+ sam_output_token = sam_output_tokens[:, 0]
247
+ if multimask_output:
248
+ # take the best mask prediction (with the highest IoU estimation)
249
+ best_iou_inds = torch.argmax(ious, dim=-1)
250
+ batch_inds = torch.arange(B, device=device)
251
+ low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
252
+ high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
253
+ if sam_output_tokens.size(1) > 1:
254
+ sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
255
+ else:
256
+ low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
257
+
258
+ # Extract object pointer from the SAM output token (with occlusion handling)
259
+ obj_ptr = self.obj_ptr_proj(sam_output_token)
260
+ if self.pred_obj_scores:
261
+ # Allow *soft* no obj ptr, unlike for masks
262
+ if self.soft_no_obj_ptr:
263
+ # Only hard possible with gt
264
+ assert not self.teacher_force_obj_scores_for_mem
265
+ lambda_is_obj_appearing = object_score_logits.sigmoid()
266
+ else:
267
+ lambda_is_obj_appearing = is_obj_appearing.float()
268
+
269
+ if self.fixed_no_obj_ptr:
270
+ obj_ptr = lambda_is_obj_appearing * obj_ptr
271
+ obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
272
+
273
+ return (
274
+ low_res_multimasks,
275
+ high_res_multimasks,
276
+ ious,
277
+ low_res_masks,
278
+ high_res_masks,
279
+ obj_ptr,
280
+ object_score_logits,
281
+ )
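This subclass keeps the stock SAM2 tracking flow but threads an extra `language_embd` argument from `track_step` into `_forward_sam_heads`, where it is concatenated with the sparse prompt embeddings before the mask decoder runs. Below is a minimal, self-contained sketch of the two extension points, with a stand-in decoder output and assumed dimensions; the `text_hidden_to_prompt` projection, `llm_dim`, and the tensor sizes are illustrative and not taken from this file:

```python
import torch
import torch.nn as nn

B, P, C = 2, 1, 256                   # batch, point-prompt tokens, SAM prompt dim
num_seg_tokens, llm_dim = 1, 4096     # e.g. one [SEG]-style token from the LLM

sparse_embeddings = torch.randn(B, P, C)              # stand-in for sam_prompt_encoder output
llm_hidden = torch.randn(B, num_seg_tokens, llm_dim)  # stand-in for LLM hidden states

# (1) Project the LLM hidden states into the SAM prompt space and append them,
#     mirroring `sparse_embeddings = torch.cat([sparse_embeddings, language_embd], dim=1)`.
text_hidden_to_prompt = nn.Linear(llm_dim, C)         # assumed projection; not defined in this file
language_embd = text_hidden_to_prompt(llm_hidden)     # [B, N, C]
sparse_embeddings = torch.cat([sparse_embeddings, language_embd], dim=1)

# (2) Select the best of M candidate masks by predicted IoU, as `_forward_sam_heads`
#     does when `multimask_output=True`.
M, H, W = 3, 64, 64
low_res_multimasks = torch.randn(B, M, H, W)          # stand-in for decoder mask logits
ious = torch.rand(B, M)                               # stand-in for decoder IoU estimates
best_iou_inds = torch.argmax(ious, dim=-1)
batch_inds = torch.arange(B)
low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)  # [B, 1, H, W]

print(sparse_embeddings.shape)   # torch.Size([2, 2, 256])
print(low_res_masks.shape)       # torch.Size([2, 1, 64, 64])
```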