evf-sam2

Running on Zero

App Files Files Community

wondervictor committed on Jul 31

Commit

a93afca

•

1 Parent(s): 2ca88bd

add app

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.DS_Store +0 -0
LICENSE +201 -0
README.md +1 -1
app.py +80 -0
inference.py +189 -0
model/EfficientSAM/.DS_Store +0 -0
model/EfficientSAM/efficient_sam/__init__.py +7 -0
model/EfficientSAM/efficient_sam/build_efficient_sam.py +22 -0
model/EfficientSAM/efficient_sam/efficient_sam.py +306 -0
model/EfficientSAM/efficient_sam/efficient_sam_decoder.py +318 -0
model/EfficientSAM/efficient_sam/efficient_sam_encoder.py +257 -0
model/EfficientSAM/efficient_sam/mlp.py +29 -0
model/EfficientSAM/efficient_sam/two_way_transformer.py +266 -0
model/configuration_evf.py +113 -0
model/evf_effisam.py +313 -0
model/evf_sam.py +303 -0
model/segment_anything/__init__.py +10 -0
model/segment_anything/automatic_mask_generator.py +372 -0
model/segment_anything/build_sam.py +108 -0
model/segment_anything/modeling/__init__.py +11 -0
model/segment_anything/modeling/common.py +43 -0
model/segment_anything/modeling/image_encoder.py +426 -0
model/segment_anything/modeling/mask_decoder.py +191 -0
model/segment_anything/modeling/prompt_encoder.py +238 -0
model/segment_anything/modeling/sam.py +184 -0
model/segment_anything/modeling/transformer.py +242 -0
model/segment_anything/predictor.py +284 -0
model/segment_anything/utils/__init__.py +5 -0
model/segment_anything/utils/amg.py +346 -0
model/segment_anything/utils/onnx.py +157 -0
model/segment_anything/utils/transforms.py +113 -0
model/unilm/beit3/README.md +191 -0
model/unilm/beit3/datasets.py +847 -0
model/unilm/beit3/engine_for_finetuning.py +598 -0
model/unilm/beit3/get_started/get_started_for_captioning.md +176 -0
model/unilm/beit3/get_started/get_started_for_image_classification.md +138 -0
model/unilm/beit3/get_started/get_started_for_nlvr2.md +136 -0
model/unilm/beit3/get_started/get_started_for_retrieval.md +161 -0
model/unilm/beit3/get_started/get_started_for_vqav2.md +144 -0
model/unilm/beit3/glossary.py +190 -0
model/unilm/beit3/modeling_finetune.py +386 -0
model/unilm/beit3/modeling_utils.py +76 -0
model/unilm/beit3/optim_factory.py +128 -0
model/unilm/beit3/randaug.py +340 -0
model/unilm/beit3/requirements.txt +22 -0
model/unilm/beit3/run_beit3_finetuning.py +448 -0
model/unilm/beit3/utils.py +913 -0
requirements.txt +32 -0
utils/ade20k_classes.json +30 -0
utils/aug.py +117 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Evf Sam
 emoji: 👀
 colorFrom: yellow
 colorTo: gray

 ---
+title: EVF-SAM
 emoji: 👀
 colorFrom: yellow
 colorTo: gray

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import gradio as gr
+from inference import sam_preprocess, beit3_preprocess
+from model.evf_sam import EvfSamModel
+from transformers import AutoTokenizer
+import torch
+import numpy as np
+import sys
+# Identifier passed to from_pretrained for both the tokenizer and the model.
+version = "YxZhang/evf-sam"
+# Forwarded to sam_preprocess(model_type=...) inside pred().
+model_type = "ori"
+tokenizer = AutoTokenizer.from_pretrained(
+    version,
+    padding_side="right",
+    use_fast=False,
+)
+# Request half-precision weights when loading the model.
+kwargs = {
+    "torch_dtype": torch.half,
+}
+# Loaded once at import time, moved to CUDA and put in eval mode;
+# pred() reads this module-level object on every request.
+model = EvfSamModel.from_pretrained(version, low_cpu_mem_usage=True,
+                                    **kwargs).cuda().eval()
+@torch.no_grad()
+def pred(image_np, prompt):
+    original_size_list = [image_np.shape[:2]]
+    image_beit = beit3_preprocess(image_np, 224).to(dtype=model.dtype,
+                                                    device=model.device)
+    image_sam, resize_shape = sam_preprocess(image_np, model_type=model_type)
+    image_sam = image_sam.to(dtype=model.dtype, device=model.device)
+    input_ids = tokenizer(
+        prompt, return_tensors="pt")["input_ids"].to(device=model.device)
+    # infer
+    pred_mask = model.inference(
+        image_sam.unsqueeze(0),
+        image_beit.unsqueeze(0),
+        input_ids,
+        resize_list=[resize_shape],
+        original_size_list=original_size_list,
+    )
+    pred_mask = pred_mask.detach().cpu().numpy()[0]
+    pred_mask = pred_mask > 0
+    visualization = image_np.copy()
+    visualization[pred_mask] = (image_np * 0.5 +
+                                pred_mask[:, :, None].astype(np.uint8) *
+                                np.array([50, 120, 220]) * 0.5)[pred_mask]
+    return visualization / 255.0, pred_mask.astype(np.float16)
+# Gradio UI: one image plus one text prompt in, the overlay and the mask
+# returned by pred() out.
+demo = gr.Interface(
+    fn=pred,
+    inputs=[
+        # type="numpy" / image_mode="RGB" determine what pred() receives.
+        gr.components.Image(type="numpy", label="Image", image_mode="RGB"),
+        gr.components.Textbox(
+            label="Prompt",
+            info=
+            "Use a phrase or sentence to describe the object you want to segment. Currently we only support English"
+        )
+    ],
+    # Order matches the tuple returned by pred(): (overlay, mask).
+    outputs=[
+        gr.components.Image(type="numpy", label="visulization"),
+        gr.components.Image(type="numpy", label="mask")
+    ],
+    # Each example is an [image path, prompt] pair.
+    # NOTE(review): the assets/ files are not among the files listed in
+    # this commit view -- confirm they exist in the repository.
+    examples=[["assets/zebra.jpg", "zebra top left"],
+              ["assets/bus.jpg", "bus going to south common"],
+              [
+                  "assets/carrots.jpg",
+                  "3carrots in center with ice and greenn leaves"
+              ]],
+    title="EVF-SAM referring expression segmentation",
+    allow_flagging="never")
+# demo.launch()
+# Listen on all interfaces at port 10001 without creating a public share link.
+demo.launch(share=False, server_name="0.0.0.0", server_port=10001)

inference.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import argparse
+import os
+import sys
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoTokenizer, BitsAndBytesConfig
+from model.segment_anything.utils.transforms import ResizeLongestSide
+def parse_args(args):
+    parser = argparse.ArgumentParser(description="EVF infer")
+    parser.add_argument("--version", required=True)
+    parser.add_argument("--vis_save_path", default="./infer", type=str)
+    parser.add_argument(
+        "--precision",
+        default="fp16",
+        type=str,
+        choices=["fp32", "bf16", "fp16"],
+        help="precision for inference",
+    )
+    parser.add_argument("--image_size", default=224, type=int, help="image size")
+    parser.add_argument("--model_max_length", default=512, type=int)
+    parser.add_argument("--local-rank", default=0, type=int, help="node rank")
+    parser.add_argument("--load_in_8bit", action="store_true", default=False)
+    parser.add_argument("--load_in_4bit", action="store_true", default=False)
+    parser.add_argument("--model_type", default="ori", choices=["ori", "effi"])
+    parser.add_argument("--image_path", type=str, default="assets/zebra.jpg")
+    parser.add_argument("--prompt", type=str, default="zebra top left")
+    return parser.parse_args(args)
+def sam_preprocess(
+    x: np.ndarray,
+    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
+    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
+    img_size=1024,
+    model_type="ori") -> "tuple[torch.Tensor, tuple[int, int]]":
+    '''
+    Preprocess an image for the Segment Anything image encoder: resize,
+    normalize and bring to a square of side img_size.
+    The last step differs by model_type: "effi" (Effi-SAM) stretches the
+    image to img_size x img_size with bilinear interpolation, anything else
+    zero-pads on the bottom and right.
+    input: ndarray image with channels last (it is permuted to channels
+        first below).
+    output: (image tensor, resize_shape), where resize_shape is the
+        (height, width) of the image after the longest-side resize and
+        before padding/stretching.
+    '''
+    assert img_size==1024, \
+        "both SAM and Effi-SAM receive images of size 1024^2, don't change this setting unless you're sure that your employed model works well with another size."
+    x = ResizeLongestSide(img_size).apply_image(x)
+    # Remember the resized (H, W); it is returned alongside the tensor.
+    resize_shape = x.shape[:2]
+    # HWC -> CHW
+    x = torch.from_numpy(x).permute(2,0,1).contiguous()
+    # Normalize colors
+    x = (x - pixel_mean) / pixel_std
+    if model_type=="effi":
+        # F.interpolate needs a batch dimension; add it and drop it again.
+        x = F.interpolate(x.unsqueeze(0), (img_size, img_size), mode="bilinear").squeeze(0)
+    else:
+        # Pad
+        h, w = x.shape[-2:]
+        padh = img_size - h
+        padw = img_size - w
+        # F.pad order is (left, right, top, bottom): pad right and bottom only.
+        x = F.pad(x, (0, padw, 0, padh))
+    return x, resize_shape
+def beit3_preprocess(x: np.ndarray, img_size=224) -> torch.Tensor:
+    '''
+    preprocess for BEIT-3 model.
+    input: ndarray
+    output: torch.Tensor
+    '''
+    beit_preprocess = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Resize((img_size, img_size), interpolation=InterpolationMode.BICUBIC),
+        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
+    ])
+    return beit_preprocess(x)
+def init_models(args):
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.version,
+        padding_side="right",
+        use_fast=False,
+    )
+    torch_dtype = torch.float32
+    if args.precision == "bf16":
+        torch_dtype = torch.bfloat16
+    elif args.precision == "fp16":
+        torch_dtype = torch.half
+    kwargs = {"torch_dtype": torch_dtype}
+    if args.load_in_4bit:
+        kwargs.update(
+            {
+                "torch_dtype": torch.half,
+                "quantization_config": BitsAndBytesConfig(
+                    llm_int8_skip_modules=["visual_model"],
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4",
+                ),
+            }
+        )
+    elif args.load_in_8bit:
+        kwargs.update(
+            {
+                "torch_dtype": torch.half,
+                "quantization_config": BitsAndBytesConfig(
+                    llm_int8_skip_modules=["visual_model"],
+                    load_in_8bit=True,
+                ),
+            }
+        )
+    if args.model_type=="ori":
+        from model.evf_sam import EvfSamModel
+        model = EvfSamModel.from_pretrained(
+            args.version, low_cpu_mem_usage=True, **kwargs
+        )
+    elif args.model_type=="effi":
+        from model.evf_effisam import EvfEffiSamModel
+        model = EvfEffiSamModel.from_pretrained(
+            args.version, low_cpu_mem_usage=True, **kwargs
+        )
+    if (not args.load_in_4bit) and (not args.load_in_8bit):
+        model = model.cuda()
+    model.eval()
+    return tokenizer, model
+def main(args):
+    args = parse_args(args)
+    # clarify IO
+    image_path = args.image_path
+    if not os.path.exists(image_path):
+        print("File not found in {}".format(image_path))
+        exit()
+    prompt = args.prompt
+    os.makedirs(args.vis_save_path, exist_ok=True)
+    save_path = "{}/{}_vis.png".format(
+        args.vis_save_path, os.path.basename(image_path).split(".")[0]
+    )
+    # initialize model and tokenizer
+    tokenizer, model = init_models(args)
+    # preprocess
+    image_np = cv2.imread(image_path)
+    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
+    original_size_list = [image_np.shape[:2]]
+    image_beit = beit3_preprocess(image_np, args.image_size).to(dtype=model.dtype, device=model.device)
+    image_sam, resize_shape = sam_preprocess(image_np, model_type=args.model_type)
+    image_sam = image_sam.to(dtype=model.dtype, device=model.device)
+    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device=model.device)
+    # infer
+    pred_mask = model.inference(
+        image_sam.unsqueeze(0),
+        image_beit.unsqueeze(0),
+        input_ids,
+        resize_list=[resize_shape],
+        original_size_list=original_size_list,
+    )
+    pred_mask = pred_mask.detach().cpu().numpy()[0]
+    pred_mask = pred_mask > 0
+    # save visualization
+    save_img = image_np.copy()
+    save_img[pred_mask] = (
+        image_np * 0.5
+        + pred_mask[:, :, None].astype(np.uint8) * np.array([50, 120, 220]) * 0.5
+    )[pred_mask]
+    save_img = cv2.cvtColor(save_img, cv2.COLOR_RGB2BGR)
+    cv2.imwrite(save_path, save_img)
+if __name__ == "__main__":
+    main(sys.argv[1:])

model/EfficientSAM/.DS_Store ADDED Viewed

Binary file (10.2 kB). View file

model/EfficientSAM/efficient_sam/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from .build_efficient_sam import (
+    build_efficient_sam_vitt,
+    build_efficient_sam_vits,
+)

model/EfficientSAM/efficient_sam/build_efficient_sam.py ADDED Viewed

	@@ -0,0 +1,22 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from .efficient_sam import build_efficient_sam
+def build_efficient_sam_vitt(checkpoint=None):
+    return build_efficient_sam(
+        encoder_patch_embed_dim=192,
+        encoder_num_heads=3,
+        checkpoint=checkpoint,
+    ).eval()
+def build_efficient_sam_vits(checkpoint=None):
+    return build_efficient_sam(
+        encoder_patch_embed_dim=384,
+        encoder_num_heads=6,
+        checkpoint=checkpoint,
+    ).eval()

model/EfficientSAM/efficient_sam/efficient_sam.py ADDED Viewed

	@@ -0,0 +1,306 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Any, List, Tuple, Type
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+from .efficient_sam_decoder import MaskDecoder, PromptEncoder
+from .efficient_sam_encoder import ImageEncoderViT
+from .two_way_transformer import TwoWayAttentionBlock, TwoWayTransformer
+class EfficientSam(nn.Module):
+    # Class-level constants; neither is read by the methods of this class.
+    mask_threshold: float = 0.0
+    image_format: str = "RGB"
+    def __init__(
+        self,
+        image_encoder: ImageEncoderViT,
+        prompt_encoder: PromptEncoder,
+        decoder_max_num_input_points: int,
+        mask_decoder: MaskDecoder,
+        pixel_mean: List[float] = [0.485, 0.456, 0.406],
+        pixel_std: List[float] = [0.229, 0.224, 0.225],
+    ) -> None:
+        """
+        SAM predicts object masks from an image and input prompts.
+        Arguments:
+          image_encoder (ImageEncoderViT): The backbone used to encode the
+            image into image embeddings that allow for efficient mask prediction.
+          prompt_encoder (PromptEncoder): Encodes various types of input prompts.
+          decoder_max_num_input_points (int): Number of points per query that
+            predict_masks truncates or pads the point prompts to.
+          mask_decoder (MaskDecoder): Predicts masks from the image embeddings
+            and encoded prompts.
+          pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
+          pixel_std (list(float)): Std values for normalizing pixels in the input image.
+        """
+        super().__init__()
+        self.image_encoder = image_encoder
+        self.prompt_encoder = prompt_encoder
+        self.decoder_max_num_input_points = decoder_max_num_input_points
+        self.mask_decoder = mask_decoder
+        # Third argument False = non-persistent buffer (not saved in state_dict).
+        self.register_buffer(
+            "pixel_mean", torch.Tensor(pixel_mean).view(1, 3, 1, 1), False
+        )
+        self.register_buffer(
+            "pixel_std", torch.Tensor(pixel_std).view(1, 3, 1, 1), False
+        )
+    @torch.jit.export
+    def predict_masks(
+        self,
+        image_embeddings: torch.Tensor,
+        batched_points: torch.Tensor,
+        batched_point_labels: torch.Tensor,
+        multimask_output: bool,
+        input_h: int,
+        input_w: int,
+        output_h: int = -1,
+        output_w: int = -1,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Predicts masks given image embeddings and prompts. This only runs the decoder.
+        Arguments:
+          image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W]
+          batched_points: A tensor of shape [B, max_num_queries, num_pts, 2]
+          batched_point_labels: A tensor of shape [B, max_num_queries, num_pts]
+          multimask_output: Forwarded unchanged to the mask decoder.
+          input_h, input_w: Size of the image the point coordinates refer to;
+            used to rescale the points to the encoder's img_size.
+          output_h, output_w: If both are positive, masks are resized to this
+            size with bicubic interpolation; otherwise the decoder's
+            resolution is kept.
+        Returns:
+          A tuple of two tensors:
+            output_masks: A tensor of shape
+              [B, max_num_queries, num_predictions, H_out, W_out] of predicted masks
+            iou_predictions: A tensor of shape [B, max_num_queries, num_predictions]
+              of estimated IOU scores
+        """
+        batch_size, max_num_queries, num_pts, _ = batched_points.shape
+        # Redundant with the unpacking above; same value.
+        num_pts = batched_points.shape[2]
+        rescaled_batched_points = self.get_rescaled_pts(batched_points, input_h, input_w)
+        # Bring every query to exactly decoder_max_num_input_points points:
+        # truncate the surplus, or pad the tail with -1.
+        if num_pts > self.decoder_max_num_input_points:
+            rescaled_batched_points = rescaled_batched_points[
+                :, :, : self.decoder_max_num_input_points, :
+            ]
+            batched_point_labels = batched_point_labels[
+                :, :, : self.decoder_max_num_input_points
+            ]
+        elif num_pts < self.decoder_max_num_input_points:
+            # Pad spec (0, 0, 0, k): nothing on the coordinate axis, k
+            # entries appended on the points axis.
+            rescaled_batched_points = F.pad(
+                rescaled_batched_points,
+                (0, 0, 0, self.decoder_max_num_input_points - num_pts),
+                value=-1.0,
+            )
+            batched_point_labels = F.pad(
+                batched_point_labels,
+                (0, self.decoder_max_num_input_points - num_pts),
+                value=-1.0,
+            )
+        # The prompt encoder sees batch and query axes flattened together.
+        sparse_embeddings = self.prompt_encoder(
+            rescaled_batched_points.reshape(
+                batch_size * max_num_queries, self.decoder_max_num_input_points, 2
+            ),
+            batched_point_labels.reshape(
+                batch_size * max_num_queries, self.decoder_max_num_input_points
+            ),
+        )
+        # Restore the separate batch and query axes.
+        sparse_embeddings = sparse_embeddings.view(
+            batch_size,
+            max_num_queries,
+            sparse_embeddings.shape[1],
+            sparse_embeddings.shape[2],
+        )
+        low_res_masks, iou_predictions = self.mask_decoder(
+            image_embeddings,
+            self.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            multimask_output=multimask_output,
+        )
+        _, num_predictions, low_res_size, _ = low_res_masks.shape
+        if output_w > 0 and output_h > 0:
+            output_masks = F.interpolate(
+                low_res_masks, (output_h, output_w), mode="bicubic"
+            )
+            output_masks = torch.reshape(
+                output_masks,
+                (batch_size, max_num_queries, num_predictions, output_h, output_w),
+            )
+        else:
+            # No target size requested: keep the decoder's square resolution.
+            output_masks = torch.reshape(
+                low_res_masks,
+                (
+                    batch_size,
+                    max_num_queries,
+                    num_predictions,
+                    low_res_size,
+                    low_res_size,
+                ),
+            )
+        iou_predictions = torch.reshape(
+            iou_predictions, (batch_size, max_num_queries, num_predictions)
+        )
+        return output_masks, iou_predictions
+    def get_rescaled_pts(self, batched_points: torch.Tensor, input_h: int, input_w: int):
+        """
+        Rescales point coordinates from the input image size to the
+        encoder's img_size. The last axis is read as (index 0, index 1) and
+        scaled by img_size / input_w and img_size / input_h respectively.
+        Negative coordinates are not scaled and are replaced by -1.0
+        (presumably the padding marker -- confirm against the prompt encoder).
+        """
+        return torch.stack(
+            [
+                torch.where(
+                    batched_points[..., 0] >= 0,
+                    batched_points[..., 0] * self.image_encoder.img_size / input_w,
+                    -1.0,
+                ),
+                torch.where(
+                    batched_points[..., 1] >= 0,
+                    batched_points[..., 1] * self.image_encoder.img_size / input_h,
+                    -1.0,
+                ),
+            ],
+            dim=-1,
+        )
+    @torch.jit.export
+    def get_image_embeddings(self, batched_images) -> torch.Tensor:
+        """
+        Preprocesses the images (resize to the encoder's input size and
+        normalize, see preprocess) and runs the image encoder on them.
+        No prompts are involved and no masks are predicted here.
+        Arguments:
+          batched_images: A tensor of shape [B, 3, H, W]
+        Returns:
+          The output of the image encoder for the preprocessed batch.
+          NOTE(review): the return annotation says a single tensor; an
+          earlier docstring described a list of embeddings -- confirm
+          against ImageEncoderViT.
+        """
+        batched_images = self.preprocess(batched_images)
+        return self.image_encoder(batched_images)
+    def forward(
+        self,
+        batched_images: torch.Tensor,
+        batched_points: torch.Tensor,
+        batched_point_labels: torch.Tensor,
+        scale_to_original_image_size: bool = True,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Predicts masks end-to-end from provided images and prompts.
+        If prompts are not known in advance, using SamPredictor is
+        recommended over calling the model directly.
+        Arguments:
+          batched_images: A tensor of shape [B, 3, H, W]
+          batched_points: A tensor of shape [B, num_queries, max_num_pts, 2]
+          batched_point_labels: A tensor of shape [B, num_queries, max_num_pts]
+          scale_to_original_image_size: If True, masks are resized to the
+            H x W of batched_images; otherwise the decoder's resolution is kept.
+        Returns:
+          The tuple returned by predict_masks, called with multimask_output=True:
+            output_masks: A tensor of shape
+              [B, num_queries, num_predictions, H_out, W_out] of predicted masks
+            iou_predictions: A tensor of shape [B, num_queries, num_predictions]
+              of estimated IOU scores
+        """
+        # batch_size is unpacked but not used below.
+        batch_size, _, input_h, input_w = batched_images.shape
+        image_embeddings = self.get_image_embeddings(batched_images)
+        return self.predict_masks(
+            image_embeddings,
+            batched_points,
+            batched_point_labels,
+            multimask_output=True,
+            input_h=input_h,
+            input_w=input_w,
+            output_h=input_h if scale_to_original_image_size else -1,
+            output_w=input_w if scale_to_original_image_size else -1,
+        )
+    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
+        """Resize to the encoder's square input size, then normalize pixel values.
+        Images whose height or width differs from image_encoder.img_size are
+        stretched with bilinear interpolation (no padding is applied)."""
+        if (
+            x.shape[2] != self.image_encoder.img_size
+            or x.shape[3] != self.image_encoder.img_size
+        ):
+            x = F.interpolate(
+                x,
+                (self.image_encoder.img_size, self.image_encoder.img_size),
+                mode="bilinear",
+            )
+        return (x - self.pixel_mean) / self.pixel_std
+def build_efficient_sam(encoder_patch_embed_dim, encoder_num_heads, checkpoint=None):
+    img_size = 1024
+    encoder_patch_size = 16
+    encoder_depth = 12
+    encoder_mlp_ratio = 4.0
+    encoder_neck_dims = [256, 256]
+    decoder_max_num_input_points = 6
+    decoder_transformer_depth = 2
+    decoder_transformer_mlp_dim = 2048
+    decoder_num_heads = 8
+    decoder_upscaling_layer_dims = [64, 32]
+    num_multimask_outputs = 3
+    iou_head_depth = 3
+    iou_head_hidden_dim = 256
+    activation = "gelu"
+    normalization_type = "layer_norm"
+    normalize_before_activation = False
+    assert activation == "relu" or activation == "gelu"
+    if activation == "relu":
+        activation_fn = nn.ReLU
+    else:
+        activation_fn = nn.GELU
+    image_encoder = ImageEncoderViT(
+        img_size=img_size,
+        patch_size=encoder_patch_size,
+        in_chans=3,
+        patch_embed_dim=encoder_patch_embed_dim,
+        normalization_type=normalization_type,
+        depth=encoder_depth,
+        num_heads=encoder_num_heads,
+        mlp_ratio=encoder_mlp_ratio,
+        neck_dims=encoder_neck_dims,
+        act_layer=activation_fn,
+    )
+    image_embedding_size = image_encoder.image_embedding_size
+    encoder_transformer_output_dim = image_encoder.transformer_output_dim
+    sam = EfficientSam(
+        image_encoder=image_encoder,
+        prompt_encoder=PromptEncoder(
+            embed_dim=encoder_transformer_output_dim,
+            image_embedding_size=(image_embedding_size, image_embedding_size),
+            input_image_size=(img_size, img_size),
+        ),
+        decoder_max_num_input_points=decoder_max_num_input_points,
+        mask_decoder=MaskDecoder(
+            transformer_dim=encoder_transformer_output_dim,
+            transformer=TwoWayTransformer(
+                depth=decoder_transformer_depth,
+                embedding_dim=encoder_transformer_output_dim,
+                num_heads=decoder_num_heads,
+                mlp_dim=decoder_transformer_mlp_dim,
+                activation=activation_fn,
+                normalize_before_activation=normalize_before_activation,
+            ),
+            num_multimask_outputs=num_multimask_outputs,
+            activation=activation_fn,
+            normalization_type=normalization_type,
+            normalize_before_activation=normalize_before_activation,
+            iou_head_depth=iou_head_depth - 1,
+            iou_head_hidden_dim=iou_head_hidden_dim,
+            upscaling_layer_dims=decoder_upscaling_layer_dims,
+        ),
+        pixel_mean=[0.485, 0.456, 0.406],
+        pixel_std=[0.229, 0.224, 0.225],
+    )
+    if checkpoint is not None:
+        with open(checkpoint, "rb") as f:
+            state_dict = torch.load(f, map_location="cpu")
+        sam.load_state_dict(state_dict["model"])
+    return sam

model/EfficientSAM/efficient_sam/efficient_sam_decoder.py ADDED Viewed

	@@ -0,0 +1,318 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List, Tuple, Type
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .mlp import MLPBlock
+class PromptEncoder(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        image_embedding_size: Tuple[int, int],
+        input_image_size: Tuple[int, int],
+    ) -> None:
+        """
+        Encodes prompts for input to SAM's mask decoder.
+        Arguments:
+          embed_dim (int): The prompts' embedding dimension
+          image_embedding_size (tuple(int, int)): The spatial size of the
+            image embedding, as (H, W).
+          input_image_size (int): The padded size of the image as input
+            to the image encoder, as (H, W).
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.input_image_size = input_image_size
+        self.image_embedding_size = image_embedding_size
+        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
+        self.invalid_points = nn.Embedding(1, embed_dim)
+        self.point_embeddings = nn.Embedding(1, embed_dim)
+        self.bbox_top_left_embeddings = nn.Embedding(1, embed_dim)
+        self.bbox_bottom_right_embeddings = nn.Embedding(1, embed_dim)
+    def get_dense_pe(self) -> torch.Tensor:
+        """
+        Returns the positional encoding used to encode point prompts,
+        applied to a dense set of points the shape of the image encoding.
+        Returns:
+          torch.Tensor: Positional encoding with shape
+            1x(embed_dim)x(embedding_h)x(embedding_w)
+        """
+        return self.pe_layer(self.image_embedding_size).unsqueeze(0)
+    def _embed_points(
+        self,
+        points: torch.Tensor,
+        labels: torch.Tensor,
+    ) -> torch.Tensor:
+        """Embeds point prompts."""
+        points = points + 0.5  # Shift to center of pixel
+        point_embedding = self.pe_layer.forward_with_coords(
+            points, self.input_image_size
+        )
+        invalid_label_ids = torch.eq(labels, -1)[:,:,None]
+        point_label_ids = torch.eq(labels, 1)[:,:,None]
+        topleft_label_ids = torch.eq(labels, 2)[:,:,None]
+        bottomright_label_ids = torch.eq(labels, 3)[:,:,None]
+        point_embedding = point_embedding + self.invalid_points.weight[:,None,:] * invalid_label_ids
+        point_embedding = point_embedding + self.point_embeddings.weight[:,None,:] * point_label_ids
+        point_embedding = point_embedding + self.bbox_top_left_embeddings.weight[:,None,:] * topleft_label_ids
+        point_embedding = point_embedding + self.bbox_bottom_right_embeddings.weight[:,None,:] * bottomright_label_ids
+        return point_embedding
+    def forward(
+        self,
+        coords,
+        labels,
+    ) -> torch.Tensor:
+        """
+        Embeds different types of prompts, returning both sparse and dense
+        embeddings.
+        Arguments:
+          points: A tensor of shape [B, 2]
+          labels: An integer tensor of shape [B] where each element is 1,2 or 3.
+        Returns:
+          torch.Tensor: sparse embeddings for the points and boxes, with shape
+            BxNx(embed_dim), where N is determined by the number of input points
+            and boxes.
+        """
+        return self._embed_points(coords, labels)
+class PositionEmbeddingRandom(nn.Module):
+    """
+    Positional encoding using random spatial frequencies.
+    """
+    def __init__(self, num_pos_feats: int) -> None:
+        super().__init__()
+        self.register_buffer(
+            "positional_encoding_gaussian_matrix", torch.randn((2, num_pos_feats))
+        )
+    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+        """Positionally encode points that are normalized to [0,1]."""
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        coords = 2 * coords - 1
+        coords = coords @ self.positional_encoding_gaussian_matrix
+        coords = 2 * np.pi * coords
+        # outputs d_1 x ... x d_n x C shape
+        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
+    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+        """Generate positional encoding for a grid of the specified size."""
+        h, w = size
+        device = self.positional_encoding_gaussian_matrix.device
+        grid = torch.ones([h, w], device=device, dtype=self.positional_encoding_gaussian_matrix.dtype)
+        y_embed = grid.cumsum(dim=0) - 0.5
+        x_embed = grid.cumsum(dim=1) - 0.5
+        y_embed = y_embed / h
+        x_embed = x_embed / w
+        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
+        return pe.permute(2, 0, 1)  # C x H x W
+    def forward_with_coords(
+        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+    ) -> torch.Tensor:
+        """Positionally encode points that are not normalized to [0,1]."""
+        coords = coords_input.clone()
+        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
+        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
+        # remove to(float) here, don't know why original implementation add this
+        return self._pe_encoding(coords)  # B x N x C
+class MaskDecoder(nn.Module):
+    def __init__(
+        self,
+        *,
+        transformer_dim: int,
+        transformer: nn.Module,
+        num_multimask_outputs: int,
+        activation: Type[nn.Module],
+        normalization_type: str,
+        normalize_before_activation: bool,
+        iou_head_depth: int,
+        iou_head_hidden_dim: int,
+        upscaling_layer_dims: List[int],
+    ) -> None:
+        """
+        Predicts masks given an image and prompt embeddings, using a
+        transformer architecture.
+        Arguments:
+          transformer_dim (int): the channel dimension of the transformer
+          transformer (nn.Module): the transformer used to predict masks
+          num_multimask_outputs (int): the number of masks to predict
+            when disambiguating masks
+          activation (nn.Module): the type of activation to use when
+            upscaling masks
+          iou_head_depth (int): the depth of the MLP used to predict
+            mask quality
+          iou_head_hidden_dim (int): the hidden dimension of the MLP
+            used to predict mask quality
+        """
+        super().__init__()
+        self.transformer_dim = transformer_dim
+        self.transformer = transformer
+        self.num_multimask_outputs = num_multimask_outputs
+        self.iou_token = nn.Embedding(1, transformer_dim)
+        if num_multimask_outputs > 1:
+            self.num_mask_tokens = num_multimask_outputs + 1
+        else:
+            self.num_mask_tokens = 1
+        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
+        output_dim_after_upscaling = transformer_dim
+        self.final_output_upscaling_layers = nn.ModuleList([])
+        for idx, layer_dims in enumerate(upscaling_layer_dims):
+            self.final_output_upscaling_layers.append(
+                nn.Sequential(
+                    nn.ConvTranspose2d(
+                        output_dim_after_upscaling,
+                        layer_dims,
+                        kernel_size=2,
+                        stride=2,
+                    ),
+                    nn.GroupNorm(1, layer_dims)
+                    if idx < len(upscaling_layer_dims) - 1
+                    else nn.Identity(),
+                    activation(),
+                )
+            )
+            output_dim_after_upscaling = layer_dims
+        self.output_hypernetworks_mlps = nn.ModuleList(
+            [
+                MLPBlock(
+                    input_dim=transformer_dim,
+                    hidden_dim=transformer_dim,
+                    output_dim=output_dim_after_upscaling,
+                    num_layers=2,
+                    act=activation,
+                )
+                for i in range(self.num_mask_tokens)
+            ]
+        )
+        self.iou_prediction_head = MLPBlock(
+            input_dim=transformer_dim,
+            hidden_dim=iou_head_hidden_dim,
+            output_dim=self.num_mask_tokens,
+            num_layers=iou_head_depth,
+            act=activation,
+        )
+    def forward(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+        multimask_output: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Predict masks given image and prompt embeddings.
+        Arguments:
+          image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W]
+          image_pe (torch.Tensor): positional encoding with the shape of image_embeddings (the batch dimension is broadcastable).
+          sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
+          multimask_output (bool): Whether to return multiple masks or a single
+            mask.
+        Returns:
+          torch.Tensor: batched predicted masks
+          torch.Tensor: batched predictions of mask quality
+        """
+        (
+            batch_size,
+            max_num_queries,
+            sparse_embed_dim_1,
+            sparse_embed_dim_2,
+        ) = sparse_prompt_embeddings.shape
+        (
+            _,
+            image_embed_dim_c,
+            image_embed_dim_h,
+            image_embed_dim_w,
+        ) = image_embeddings.shape
+        # Tile the image embedding for all queries.
+        image_embeddings_tiled = torch.tile(
+            image_embeddings[:, None, :, :, :], [1, max_num_queries, 1, 1, 1]
+        ).view(
+            batch_size * max_num_queries,
+            image_embed_dim_c,
+            image_embed_dim_h,
+            image_embed_dim_w,
+        )
+        sparse_prompt_embeddings = sparse_prompt_embeddings.reshape(
+            batch_size * max_num_queries, sparse_embed_dim_1, sparse_embed_dim_2
+        )
+        masks, iou_pred = self.predict_masks(
+            image_embeddings=image_embeddings_tiled,
+            image_pe=image_pe,
+            sparse_prompt_embeddings=sparse_prompt_embeddings,
+        )
+        if multimask_output and self.num_multimask_outputs > 1:
+            return masks[:, 1:, :], iou_pred[:, 1:]
+        else:
+            return masks[:, :1, :], iou_pred[:, :1]
+    def predict_masks(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Predicts masks. See 'forward' for more details."""
+        # Concatenate output tokens
+        output_tokens = torch.cat(
+            [self.iou_token.weight, self.mask_tokens.weight], dim=0
+        )
+        output_tokens = output_tokens.unsqueeze(0).expand(
+            sparse_prompt_embeddings.size(0), -1, -1
+        )
+        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
+        # Expand per-image data in batch direction to be per-mask
+        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
+        b, c, h, w = image_embeddings.shape
+        hs, src = self.transformer(image_embeddings, pos_src, tokens)
+        iou_token_out = hs[:, 0, :]
+        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
+        # Upscale mask embeddings and predict masks using the mask tokens
+        upscaled_embedding = src.transpose(1, 2).view(b, c, h, w)
+        for upscaling_layer in self.final_output_upscaling_layers:
+            upscaled_embedding = upscaling_layer(upscaled_embedding)
+        hyper_in_list: List[torch.Tensor] = []
+        for i, output_hypernetworks_mlp in enumerate(self.output_hypernetworks_mlps):
+            hyper_in_list.append(output_hypernetworks_mlp(mask_tokens_out[:, i, :]))
+        hyper_in = torch.stack(hyper_in_list, dim=1)
+        b, c, h, w = upscaled_embedding.shape
+        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
+        # Generate mask quality predictions
+        iou_pred = self.iou_prediction_head(iou_token_out)
+        return masks, iou_pred

model/EfficientSAM/efficient_sam/efficient_sam_encoder.py ADDED Viewed

	@@ -0,0 +1,257 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import List, Optional, Tuple, Type
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class LayerNorm2d(nn.Module):
+    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
+class PatchEmbed(nn.Module):
+    """2D Image to Patch Embedding"""
+    def __init__(
+        self,
+        img_size,
+        patch_size,
+        in_chans,
+        embed_dim,
+    ):
+        super().__init__()
+        self.proj = nn.Conv2d(
+            in_chans,
+            embed_dim,
+            kernel_size=(patch_size, patch_size),
+            stride=(patch_size, patch_size),
+            bias=True,
+        )
+    def forward(self, x):
+        B, C, H, W = x.shape
+        x = self.proj(x)
+        return x
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        qkv_bias,
+        qk_scale=None,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = (
+            qkv[0],
+            qkv[1],
+            qkv[2],
+        )
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        return x
+class Mlp(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        act_layer=nn.GELU,
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+        )
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+        )
+    def forward(self, x):
+        x = x + self.attn(self.norm1(x))
+        x = x + self.mlp(self.norm2(x))
+        return x
+@torch.jit.export
+def get_abs_pos(
+    abs_pos: torch.Tensor, has_cls_token: bool, hw: List[int]
+) -> torch.Tensor:
+    """
+    Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
+        dimension for the original embeddings.
+    Args:
+        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
+        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
+        hw (Tuple): size of input image tokens.
+    Returns:
+        Absolute positional embeddings after processing with shape (1, H, W, C)
+    """
+    h = hw[0]
+    w = hw[1]
+    if has_cls_token:
+        abs_pos = abs_pos[:, 1:]
+    xy_num = abs_pos.shape[1]
+    size = int(math.sqrt(xy_num))
+    assert size * size == xy_num
+    if size != h or size != w:
+        new_abs_pos = F.interpolate(
+            abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
+            size=(h, w),
+            mode="bicubic",
+            align_corners=False,
+        )
+        return new_abs_pos.permute(0, 2, 3, 1)
+    else:
+        return abs_pos.reshape(1, h, w, -1)
+# Image encoder for efficient SAM.
+class ImageEncoderViT(nn.Module):
+    def __init__(
+        self,
+        img_size: int,
+        patch_size: int,
+        in_chans: int,
+        patch_embed_dim: int,
+        normalization_type: str,
+        depth: int,
+        num_heads: int,
+        mlp_ratio: float,
+        neck_dims: List[int],
+        act_layer: Type[nn.Module],
+    ) -> None:
+        """
+        Args:
+            img_size (int): Input image size.
+            patch_size (int): Patch size.
+            in_chans (int): Number of input image channels.
+            patch_embed_dim (int): Patch embedding dimension.
+            depth (int): Depth of ViT.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            act_layer (nn.Module): Activation layer.
+        """
+        super().__init__()
+        self.img_size = img_size
+        self.image_embedding_size = img_size // ((patch_size if patch_size > 0 else 1))
+        self.transformer_output_dim = ([patch_embed_dim] + neck_dims)[-1]
+        self.pretrain_use_cls_token = True
+        pretrain_img_size = 224
+        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, patch_embed_dim)
+        # Initialize absolute positional embedding with pretrain image size.
+        num_patches = (pretrain_img_size // patch_size) * (
+            pretrain_img_size // patch_size
+        )
+        num_positions = num_patches + 1
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, patch_embed_dim))
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            vit_block = Block(patch_embed_dim, num_heads, mlp_ratio, True)
+            self.blocks.append(vit_block)
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                patch_embed_dim,
+                neck_dims[0],
+                kernel_size=1,
+                bias=False,
+            ),
+            LayerNorm2d(neck_dims[0]),
+            nn.Conv2d(
+                neck_dims[0],
+                neck_dims[0],
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            LayerNorm2d(neck_dims[0]),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        assert (
+            x.shape[2] == self.img_size and x.shape[3] == self.img_size
+        ), "input image size must match self.img_size"
+        x = self.patch_embed(x)
+        # B C H W -> B H W C
+        x = x.permute(0, 2, 3, 1)
+        x = x + get_abs_pos(
+            self.pos_embed, self.pretrain_use_cls_token, [x.shape[1], x.shape[2]]
+        )
+        num_patches = x.shape[1]
+        assert x.shape[2] == num_patches
+        x = x.reshape(x.shape[0], num_patches * num_patches, x.shape[3])
+        for blk in self.blocks:
+            x = blk(x)
+        x = x.reshape(x.shape[0], num_patches, num_patches, x.shape[2])
+        x = self.neck(x.permute(0, 3, 1, 2))
+        return x

model/EfficientSAM/efficient_sam/mlp.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from typing import Type
+from torch import nn
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
+class MLPBlock(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act: Type[nn.Module],
+    ) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(
+            nn.Sequential(nn.Linear(n, k), act())
+            for n, k in zip([input_dim] + h, [hidden_dim] * num_layers)
+        )
+        self.fc = nn.Linear(hidden_dim, output_dim)
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+        return self.fc(x)

model/EfficientSAM/efficient_sam/two_way_transformer.py ADDED Viewed

	@@ -0,0 +1,266 @@

+import math
+from typing import Tuple, Type
+import torch
+from torch import nn, Tensor
+from .mlp import MLPBlock
+class TwoWayTransformer(nn.Module):
+    def __init__(
+        self,
+        depth: int,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int,
+        activation: Type[nn.Module],
+        normalize_before_activation: bool,
+        attention_downsample_rate: int = 2,
+    ) -> None:
+        """
+        A transformer decoder that attends to an input image using
+        queries whose positional embedding is supplied.
+        Args:
+          depth (int): number of layers in the transformer
+          embedding_dim (int): the channel dimension for the input embeddings
+          num_heads (int): the number of heads for multihead attention. Must
+            divide embedding_dim
+          mlp_dim (int): the channel dimension internal to the MLP block
+          activation (nn.Module): the activation to use in the MLP block
+        """
+        super().__init__()
+        self.depth = depth
+        self.embedding_dim = embedding_dim
+        self.num_heads = num_heads
+        self.mlp_dim = mlp_dim
+        self.layers = nn.ModuleList()
+        for i in range(depth):
+            curr_layer = TwoWayAttentionBlock(
+                embedding_dim=embedding_dim,
+                num_heads=num_heads,
+                mlp_dim=mlp_dim,
+                activation=activation,
+                normalize_before_activation=normalize_before_activation,
+                attention_downsample_rate=attention_downsample_rate,
+                skip_first_layer_pe=(i == 0),
+            )
+            self.layers.append(curr_layer)
+        self.final_attn_token_to_image = AttentionForTwoWayAttentionBlock(
+            embedding_dim,
+            num_heads,
+            downsample_rate=attention_downsample_rate,
+        )
+        self.norm_final_attn = nn.LayerNorm(embedding_dim)
+    def forward(
+        self,
+        image_embedding: Tensor,
+        image_pe: Tensor,
+        point_embedding: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+          image_embedding (torch.Tensor): image to attend to. Should be shape
+            B x embedding_dim x h x w for any h and w.
+          image_pe (torch.Tensor): the positional encoding to add to the image. Must
+            have the same shape as image_embedding.
+          point_embedding (torch.Tensor): the embedding to add to the query points.
+            Must have shape B x N_points x embedding_dim for any N_points.
+        Returns:
+          torch.Tensor: the processed point_embedding
+          torch.Tensor: the processed image_embedding
+        """
+        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+        bs, c, h, w = image_embedding.shape
+        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+        image_pe = image_pe.flatten(2).permute(0, 2, 1)
+        # Prepare queries
+        queries = point_embedding
+        keys = image_embedding
+        # Apply transformer blocks and final layernorm
+        for idx, layer in enumerate(self.layers):
+            queries, keys = layer(
+                queries=queries,
+                keys=keys,
+                query_pe=point_embedding,
+                key_pe=image_pe,
+            )
+        # Apply the final attention layer from the points to the image
+        q = queries + point_embedding
+        k = keys + image_pe
+        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm_final_attn(queries)
+        return queries, keys
+class TwoWayAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int,
+        activation: Type[nn.Module],
+        normalize_before_activation: bool,
+        attention_downsample_rate: int = 2,
+        skip_first_layer_pe: bool = False,
+    ) -> None:
+        """
+        A transformer block with four layers: (1) self-attention of sparse
+        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+        block on sparse inputs, and (4) cross attention of dense inputs to sparse
+        inputs.
+        Arguments:
+          embedding_dim (int): the channel dimension of the embeddings
+          num_heads (int): the number of heads in the attention layers
+          mlp_dim (int): the hidden dimension of the mlp block
+          activation (nn.Module): the activation of the mlp block
+          skip_first_layer_pe (bool): skip the PE on the first layer
+        """
+        super().__init__()
+        self.self_attn = AttentionForTwoWayAttentionBlock(embedding_dim, num_heads)
+        self.norm1 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_token_to_image = AttentionForTwoWayAttentionBlock(
+            embedding_dim,
+            num_heads,
+            downsample_rate=attention_downsample_rate,
+        )
+        self.norm2 = nn.LayerNorm(embedding_dim)
+        self.mlp = MLPBlock(
+            embedding_dim,
+            mlp_dim,
+            embedding_dim,
+            1,
+            activation,
+        )
+        self.norm3 = nn.LayerNorm(embedding_dim)
+        self.norm4 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_image_to_token = AttentionForTwoWayAttentionBlock(
+            embedding_dim,
+            num_heads,
+            downsample_rate=attention_downsample_rate,
+        )
+        self.skip_first_layer_pe = skip_first_layer_pe
+    def forward(
+        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        # Self attention block
+        if not self.skip_first_layer_pe:
+            queries = queries + query_pe
+        attn_out = self.self_attn(q=queries, k=queries, v=queries)
+        queries = queries + attn_out
+        queries = self.norm1(queries)
+        # Cross attention block, tokens attending to image embedding
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm2(queries)
+        # MLP block
+        mlp_out = self.mlp(queries)
+        queries = queries + mlp_out
+        queries = self.norm3(queries)
+        # Cross attention block, image embedding attending to tokens
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
+        keys = keys + attn_out
+        keys = self.norm4(keys)
+        return queries, keys
+class AttentionForTwoWayAttentionBlock(nn.Module):
+    """
+    An attention layer that allows for downscaling the size of the embedding
+    after projection to queries, keys, and values.
+    """
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        downsample_rate: int = 1,
+    ) -> None:
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.internal_dim = embedding_dim // downsample_rate
+        self.num_heads = num_heads
+        assert (
+            self.internal_dim % num_heads == 0
+        ), "num_heads must divide embedding_dim."
+        self.c_per_head = self.internal_dim / num_heads
+        self.inv_sqrt_c_per_head = 1.0 / math.sqrt(self.c_per_head)
+        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+        self._reset_parameters()
+    def _reset_parameters(self) -> None:
+        # The fan_out is incorrect, but matches pytorch's initialization
+        # for which qkv is a single 3*embedding_dim x embedding_dim matrix
+        fan_in = self.embedding_dim
+        fan_out = 3 * self.internal_dim
+        # Xavier uniform with our custom fan_out
+        bnd = math.sqrt(6 / (fan_in + fan_out))
+        nn.init.uniform_(self.q_proj.weight, -bnd, bnd)
+        nn.init.uniform_(self.k_proj.weight, -bnd, bnd)
+        nn.init.uniform_(self.v_proj.weight, -bnd, bnd)
+        # out_proj.weight is left with default initialization, like pytorch attention
+        nn.init.zeros_(self.q_proj.bias)
+        nn.init.zeros_(self.k_proj.bias)
+        nn.init.zeros_(self.v_proj.bias)
+        nn.init.zeros_(self.out_proj.bias)
+    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+        b, n, c = x.shape
+        x = x.reshape(b, n, num_heads, c // num_heads)
+        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
+    def _recombine_heads(self, x: Tensor) -> Tensor:
+        b, n_heads, n_tokens, c_per_head = x.shape
+        x = x.transpose(1, 2)
+        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        # Input projections
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        v = self.v_proj(v)
+        # Separate into heads
+        q = self._separate_heads(q, self.num_heads)
+        k = self._separate_heads(k, self.num_heads)
+        v = self._separate_heads(v, self.num_heads)
+        # Attention
+        _, _, _, c_per_head = q.shape
+        attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
+        attn = attn * self.inv_sqrt_c_per_head
+        attn = torch.softmax(attn, dim=-1)
+        # Get output
+        out = attn @ v
+        out = self._recombine_heads(out)
+        out = self.out_proj(out)
+        return out

model/configuration_evf.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Evf model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# Archive map for pretrained EVF configurations; defined empty in this module.
+EVF_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+class EvfConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`EvfSam`].
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        pretraining_tp (`int`, *optional*, defaults to `1`):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling
+            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        Example:
+    ```python
+    >>> configuration = EvfConfig()
+    >>> model = EvfSam(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "evf"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        hidden_size=768,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_scaling=None,
+        out_dim=256,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.out_dim = out_dim
+        # self.pretraining_tp = pretraining_tp
+        # self.rope_scaling = rope_scaling
+        # self._rope_scaling_validation()
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")

model/evf_effisam.py ADDED Viewed

	@@ -0,0 +1,313 @@

+from typing import List, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, AutoConfig, AutoModelForCausalLM
+from .EfficientSAM.efficient_sam.build_efficient_sam import build_efficient_sam_vits, build_efficient_sam_vitt
+from .unilm.beit3.modeling_utils import BEiT3Wrapper, _get_base_config, _get_large_config
+from .configuration_evf import EvfConfig
+def dice_loss(
+    inputs: torch.Tensor,
+    targets: torch.Tensor,
+    num_masks: float,
+    scale=1000,  # 100000.0,
+    eps=1e-6,
+):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1, 2)
+    targets = targets.flatten(1, 2)
+    numerator = 2 * (inputs / scale * targets).sum(-1)
+    denominator = (inputs / scale).sum(-1) + (targets / scale).sum(-1)
+    loss = 1 - (numerator + eps) / (denominator + eps)
+    loss = loss.sum() / (num_masks + 1e-8)
+    return loss
+def sigmoid_ce_loss(
+    inputs: torch.Tensor,
+    targets: torch.Tensor,
+    num_masks: float,
+):
+    """
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    Returns:
+        Loss tensor
+    """
+    loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    loss = loss.flatten(1, 2).mean(1).sum() / (num_masks + 1e-8)
+    return loss
+class EvfEffiSamModel(PreTrainedModel):
+    config_class = EvfConfig
+    def __init__(
+        self,
+        config,
+        **kwargs
+    ):
+        super(EvfEffiSamModel, self).__init__(config)
+        self.config = config
+        self.vision_pretrained = kwargs.get("vision_pretrained", None)
+        self.encoder_pretrained = kwargs.get("encoder_pretrained", None)
+        self.dice_loss_weight = kwargs.get("dice_loss_weight", None)
+        self.bce_loss_weight = kwargs.get("bce_loss_weight", None)
+        self.train_mask_decoder = kwargs.get("train_mask_decoder", False)
+        self.initialize_evf_modules(config)
+    def initialize_evf_modules(self, config):
+        # EffiSAM
+        if config.sam_scale=="tiny":
+            self.visual_model = build_efficient_sam_vitt(self.vision_pretrained)
+        elif config.sam_scale=="small":
+            # vits scale, or without pretrained weight (self.vision_pretrained=None)
+            self.visual_model = build_efficient_sam_vits(self.vision_pretrained)
+        else:
+            raise NotImplementedError
+        for param in self.visual_model.parameters():
+            param.requires_grad = False
+        if self.train_mask_decoder:
+            self.visual_model.mask_decoder.train()
+            for param in self.visual_model.mask_decoder.parameters():
+                param.requires_grad = True
+        # beit-3
+        if self.config.mm_extractor_scale == "base":
+            beit_config = _get_base_config()
+        elif self.config.mm_extractor_scale == "large":
+            beit_config = _get_large_config()
+        else:
+            raise AttributeError(f"model config should contain key 'mm_extractor_scale', with value 'base' or 'large'.")
+        self.mm_extractor = BEiT3Wrapper(beit_config)
+        if self.encoder_pretrained is not None:
+            beit_state_dict = torch.load(self.encoder_pretrained)["model"]
+            self.mm_extractor.load_state_dict(
+                beit_state_dict,
+                strict=False
+            )
+        for param in self.mm_extractor.parameters():
+            param.requires_grad = True
+        # Projection layer
+        in_dim = config.hidden_size
+        assert in_dim==beit_config.encoder_embed_dim, \
+            f"projection layer dim {in_dim} mismatch with mm_extractor dim {beit_config.encoder_embed_dim}"
+        out_dim = config.out_dim
+        text_fc = [
+            nn.Linear(in_dim, in_dim),
+            nn.ReLU(),
+            nn.Linear(in_dim, out_dim)
+        ]
+        self.text_hidden_fcs = nn.ModuleList([nn.Sequential(*text_fc)])
+        self.text_hidden_fcs.train()
+        for param in self.text_hidden_fcs.parameters():
+            param.requires_grad = True
+    def get_visual_embs(self, pixel_values: torch.Tensor):
+        with torch.no_grad():
+            image_embeddings_list = []
+            for i in range(pixel_values.shape[0]):
+                torch.cuda.empty_cache()
+                image_embeddings = self.visual_model.image_encoder(
+                    pixel_values[i].unsqueeze(0)
+                )
+                image_embeddings_list.append(image_embeddings)
+            torch.cuda.empty_cache()
+            image_embeddings = torch.cat(image_embeddings_list, 0)
+        return image_embeddings
+    def forward(
+        self,
+        images: torch.Tensor,
+        images_evf: torch.Tensor,
+        input_ids: torch.Tensor,
+        attention_masks: torch.Tensor,
+        offset: torch.Tensor,
+        masks_list: List[torch.Tensor],
+        label_list: List[torch.Tensor],
+        resize_list: List[tuple],
+        inference: bool = False,
+        **kwargs,
+    ):
+        image_embeddings = self.get_visual_embs(images)
+        batch_size = image_embeddings.shape[0]
+        assert batch_size == len(offset) - 1
+        images_evf_list = []
+        for i in range(len(offset) - 1):
+            start_i, end_i = offset[i], offset[i + 1]
+            images_evf_i = (
+                images_evf[i]
+                .unsqueeze(0)
+                .expand(end_i - start_i, -1, -1, -1)
+                .contiguous()
+            )
+            images_evf_list.append(images_evf_i)
+        images_evf = torch.cat(images_evf_list, dim=0)
+        multimask_output = False
+        output = self.mm_extractor.beit3(
+            visual_tokens=images_evf,
+            textual_tokens=input_ids,
+            text_padding_position=~attention_masks
+            )
+        feat = output["encoder_out"][:, :1, ...]
+        feat = self.text_hidden_fcs[0](feat)
+        feat = torch.split(feat, [offset[i+1] - offset[i] for i in range(len(offset)-1)])
+        pred_masks = []
+        for i in range(len(feat)):
+            sparse_embeddings = feat[i].unsqueeze(0)
+            sparse_embeddings = sparse_embeddings.to(feat[i].dtype)
+            low_res_masks, iou_predictions = self.visual_model.mask_decoder(
+                image_embeddings=image_embeddings[i].unsqueeze(0),
+                image_pe=self.visual_model.prompt_encoder.get_dense_pe(),
+                sparse_prompt_embeddings=sparse_embeddings,
+                multimask_output=multimask_output,
+            )
+            if multimask_output:
+                sorted_ids = torch.argsort(iou_predictions, dim=-1, descending=True)
+                low_res_masks = torch.take_along_dim(low_res_masks, sorted_ids[..., None, None], dim=1)
+            pred_mask = self.postprocess_masks(
+                low_res_masks[:, :1],
+                input_size=resize_list[i],
+                original_size=label_list[i].shape,
+            )
+            pred_masks.append(pred_mask[:, 0])
+        gt_masks = masks_list
+        if inference:
+            return {
+                "pred_masks": pred_masks,
+                "gt_masks": gt_masks,
+            }
+        mask_bce_loss = 0
+        mask_dice_loss = 0
+        num_masks = 0
+        for batch_idx in range(len(pred_masks)):
+            gt_mask = gt_masks[batch_idx]
+            pred_mask = pred_masks[batch_idx]
+            assert (
+                gt_mask.shape[0] == pred_mask.shape[0]
+            ), "gt_mask.shape: {}, pred_mask.shape: {}".format(
+                gt_mask.shape, pred_mask.shape
+            )
+            mask_bce_loss += (
+                sigmoid_ce_loss(pred_mask, gt_mask, num_masks=gt_mask.shape[0])
+                * gt_mask.shape[0]
+            )
+            mask_dice_loss += (
+                dice_loss(pred_mask, gt_mask, num_masks=gt_mask.shape[0])
+                * gt_mask.shape[0]
+            )
+            num_masks += gt_mask.shape[0]
+        mask_bce_loss = self.bce_loss_weight * mask_bce_loss / (num_masks + 1e-8)
+        mask_dice_loss = self.dice_loss_weight * mask_dice_loss / (num_masks + 1e-8)
+        mask_loss = mask_bce_loss + mask_dice_loss
+        loss = mask_loss
+        return {
+            "loss": loss,
+            "mask_bce_loss": mask_bce_loss,
+            "mask_dice_loss": mask_dice_loss,
+            "mask_loss": mask_loss,
+        }
+    def postprocess_masks(
+        self,
+        masks: torch.Tensor,
+        input_size: Tuple[int, ...],
+        original_size: Tuple[int, ...],
+    ) -> torch.Tensor:
+        """
+        pre-process of Effi-SAM is different from SAM, where there is no padding,
+        so cropping is not needed in post-process.
+        """
+        dtype = masks.dtype
+        # masks = F.interpolate(
+        #     masks.float(),
+        #     (1024, 1024),
+        #     mode="bilinear",
+        #     align_corners=False,
+        # )
+        # masks = masks.to(dtype)
+        # masks = masks[..., : input_size[0], : input_size[1]]
+        masks = F.interpolate(
+            masks, original_size, mode="bilinear", align_corners=False
+        )
+        masks = masks.to(dtype)
+        return masks
+    def inference(
+            self,
+            images,
+            images_evf,
+            input_ids,
+            resize_list,
+            original_size_list,
+            multimask_output=False,
+        ):
+        with torch.no_grad():
+            image_embeddings = self.visual_model.image_encoder(images)
+        output = self.mm_extractor.beit3(visual_tokens=images_evf, textual_tokens=input_ids, text_padding_position=torch.zeros_like(input_ids))
+        feat = output["encoder_out"][:, :1, ...]
+        feat = self.text_hidden_fcs[0](feat)
+        sparse_embeddings = feat.unsqueeze(0)
+        sparse_embeddings = sparse_embeddings.to(feat.dtype)
+        low_res_masks, iou_predictions = self.visual_model.mask_decoder(
+            image_embeddings=image_embeddings,
+            image_pe=self.visual_model.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            multimask_output=multimask_output,
+        )
+        if multimask_output:
+            sorted_ids = torch.argsort(iou_predictions, dim=-1, descending=True)
+            low_res_masks = torch.take_along_dim(low_res_masks, sorted_ids[..., None, None], dim=1)
+        pred_mask = self.postprocess_masks(
+            low_res_masks[:, :1],
+            input_size=resize_list[0],
+            original_size=original_size_list[0],
+        )
+        return pred_mask[:, 0]
+AutoConfig.register("evf", EvfConfig)
+AutoModelForCausalLM.register(EvfConfig, EvfEffiSamModel)

model/evf_sam.py ADDED Viewed

	@@ -0,0 +1,303 @@

+from typing import List
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, AutoConfig, AutoModelForCausalLM
+from .segment_anything import build_sam_vit_h
+from .unilm.beit3.modeling_utils import BEiT3Wrapper, _get_base_config, _get_large_config
+from .configuration_evf import EvfConfig
+def dice_loss(
+    inputs: torch.Tensor,
+    targets: torch.Tensor,
+    num_masks: float,
+    scale=1000,  # 100000.0,
+    eps=1e-6,
+):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1, 2)
+    targets = targets.flatten(1, 2)
+    numerator = 2 * (inputs / scale * targets).sum(-1)
+    denominator = (inputs / scale).sum(-1) + (targets / scale).sum(-1)
+    loss = 1 - (numerator + eps) / (denominator + eps)
+    loss = loss.sum() / (num_masks + 1e-8)
+    return loss
+def sigmoid_ce_loss(
+    inputs: torch.Tensor,
+    targets: torch.Tensor,
+    num_masks: float,
+):
+    """
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    Returns:
+        Loss tensor
+    """
+    loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    loss = loss.flatten(1, 2).mean(1).sum() / (num_masks + 1e-8)
+    return loss
+class EvfSamModel(PreTrainedModel):
+    config_class = EvfConfig
+    def __init__(
+        self,
+        config,
+        **kwargs
+    ):
+        super(EvfSamModel, self).__init__(config)
+        self.config = config
+        self.vision_pretrained = kwargs.get("vision_pretrained", None)
+        self.encoder_pretrained = kwargs.get("encoder_pretrained", None)
+        self.dice_loss_weight = kwargs.get("dice_loss_weight", None)
+        self.bce_loss_weight = kwargs.get("bce_loss_weight", None)
+        self.train_mask_decoder = kwargs.get("train_mask_decoder", False)
+        self.train_prompt_encoder = kwargs.get("train_prompt_encoder", False)
+        self.initialize_evf_modules(config)
+    def initialize_evf_modules(self, config):
+        # SAM
+        if config.sam_scale=="huge":
+            self.visual_model = build_sam_vit_h(self.vision_pretrained)
+        else:
+            raise NotImplementedError
+        for param in self.visual_model.parameters():
+            param.requires_grad = False
+        if self.train_mask_decoder:
+            self.visual_model.mask_decoder.train()
+            for param in self.visual_model.mask_decoder.parameters():
+                param.requires_grad = True
+        if self.train_prompt_encoder:
+            self.visual_model.prompt_encoder.no_mask_embed.requires_grad_(True)
+        # beit-3
+        if self.config.mm_extractor_scale == "base":
+            beit_config = _get_base_config()
+        elif self.config.mm_extractor_scale == "large":
+            beit_config = _get_large_config()
+        else:
+            raise AttributeError(f"model config should contain key 'mm_extractor_scale', with value 'base' or 'large'.")
+        self.mm_extractor = BEiT3Wrapper(beit_config)
+        if self.encoder_pretrained is not None:
+            beit_state_dict = torch.load(self.encoder_pretrained)["model"]
+            self.mm_extractor.load_state_dict(
+                beit_state_dict,
+                strict=False
+            )
+        for param in self.mm_extractor.parameters():
+            param.requires_grad = True
+        # Projection layer
+        in_dim = config.hidden_size
+        assert in_dim==beit_config.encoder_embed_dim, \
+            f"projection layer dim {in_dim} mismatch with mm_extractor dim {beit_config.encoder_embed_dim}"
+        out_dim = config.out_dim
+        text_fc = [
+            nn.Linear(in_dim, in_dim),
+            nn.ReLU(),
+            nn.Linear(in_dim, out_dim)
+        ]
+        self.text_hidden_fcs = nn.ModuleList([nn.Sequential(*text_fc)])
+        self.text_hidden_fcs.train()
+        for param in self.text_hidden_fcs.parameters():
+            param.requires_grad = True
+    def get_visual_embs(self, pixel_values: torch.FloatTensor):
+        with torch.no_grad():
+            image_embeddings_list = []
+            for i in range(pixel_values.shape[0]):
+                torch.cuda.empty_cache()
+                image_embeddings = self.visual_model.image_encoder(
+                    pixel_values[i].unsqueeze(0)
+                )
+                image_embeddings_list.append(image_embeddings)
+            torch.cuda.empty_cache()
+            image_embeddings = torch.cat(image_embeddings_list, 0)
+        return image_embeddings
+    def forward(
+        self,
+        images: torch.FloatTensor,
+        images_evf: torch.FloatTensor,
+        input_ids: torch.LongTensor,
+        attention_masks: torch.LongTensor,
+        offset: torch.LongTensor,
+        masks_list: List[torch.FloatTensor],
+        label_list: List[torch.Tensor],
+        resize_list: List[tuple],
+        inference: bool = False,
+        **kwargs,
+    ):
+        image_embeddings = self.get_visual_embs(images)
+        batch_size = image_embeddings.shape[0]
+        assert batch_size == len(offset) - 1
+        images_evf_list = []
+        for i in range(len(offset) - 1):
+            start_i, end_i = offset[i], offset[i + 1]
+            images_evf_i = (
+                images_evf[i]
+                .unsqueeze(0)
+                .expand(end_i - start_i, -1, -1, -1)
+                .contiguous()
+            )
+            images_evf_list.append(images_evf_i)
+        images_evf = torch.cat(images_evf_list, dim=0)
+        multimask_output = False
+        output = self.mm_extractor.beit3(
+            visual_tokens=images_evf,
+            textual_tokens=input_ids,
+            text_padding_position=~attention_masks
+            )
+        feat = output["encoder_out"][:, :1, ...]
+        feat = self.text_hidden_fcs[0](feat)
+        feat = torch.split(feat, [offset[i+1] - offset[i] for i in range(len(offset)-1)])
+        pred_masks = []
+        for i in range(len(feat)):
+            (
+                sparse_embeddings,
+                dense_embeddings,
+            ) = self.visual_model.prompt_encoder(
+                points=None,
+                boxes=None,
+                masks=None,
+                text_embeds=feat[i],
+            )
+            sparse_embeddings = sparse_embeddings.to(feat[i].dtype)
+            low_res_masks, iou_predictions = self.visual_model.mask_decoder(
+                image_embeddings=image_embeddings[i].unsqueeze(0),
+                image_pe=self.visual_model.prompt_encoder.get_dense_pe(),
+                sparse_prompt_embeddings=sparse_embeddings,
+                dense_prompt_embeddings=dense_embeddings,
+                multimask_output=multimask_output,
+            )
+            if multimask_output:
+                sorted_ids = torch.argsort(iou_predictions, dim=-1, descending=True)
+                low_res_masks = torch.take_along_dim(low_res_masks, sorted_ids[..., None, None], dim=1)[:, :1]
+            pred_mask = self.visual_model.postprocess_masks(
+                low_res_masks,
+                input_size=resize_list[i],
+                original_size=label_list[i].shape,
+            )
+            pred_masks.append(pred_mask[:, 0])
+        gt_masks = masks_list
+        if inference:
+            return {
+                "pred_masks": pred_masks,
+                "gt_masks": gt_masks,
+            }
+        mask_bce_loss = 0
+        mask_dice_loss = 0
+        num_masks = 0
+        for batch_idx in range(len(pred_masks)):
+            gt_mask = gt_masks[batch_idx]
+            pred_mask = pred_masks[batch_idx]
+            assert (
+                gt_mask.shape[0] == pred_mask.shape[0]
+            ), "gt_mask.shape: {}, pred_mask.shape: {}".format(
+                gt_mask.shape, pred_mask.shape
+            )
+            mask_bce_loss += (
+                sigmoid_ce_loss(pred_mask, gt_mask, num_masks=gt_mask.shape[0])
+                * gt_mask.shape[0]
+            )
+            mask_dice_loss += (
+                dice_loss(pred_mask, gt_mask, num_masks=gt_mask.shape[0])
+                * gt_mask.shape[0]
+            )
+            num_masks += gt_mask.shape[0]
+        mask_bce_loss = self.bce_loss_weight * mask_bce_loss / (num_masks + 1e-8)
+        mask_dice_loss = self.dice_loss_weight * mask_dice_loss / (num_masks + 1e-8)
+        mask_loss = mask_bce_loss + mask_dice_loss
+        loss = mask_loss
+        return {
+            "loss": loss,
+            "mask_bce_loss": mask_bce_loss,
+            "mask_dice_loss": mask_dice_loss,
+            "mask_loss": mask_loss,
+        }
+    def inference(
+            self,
+            images,
+            images_evf,
+            input_ids,
+            resize_list,
+            original_size_list,
+            multimask_output=False,
+        ):
+        with torch.no_grad():
+            image_embeddings = self.visual_model.image_encoder(images)
+        multimask_output = multimask_output
+        output = self.mm_extractor.beit3(visual_tokens=images_evf, textual_tokens=input_ids, text_padding_position=torch.zeros_like(input_ids))
+        feat = output["encoder_out"][:, :1, ...]
+        feat = self.text_hidden_fcs[0](feat)
+        (
+            sparse_embeddings,
+            dense_embeddings,
+        ) = self.visual_model.prompt_encoder(
+            points=None,
+            boxes=None,
+            masks=None,
+            text_embeds=feat,
+        )
+        sparse_embeddings = sparse_embeddings.to(feat.dtype)
+        low_res_masks, iou_predictions = self.visual_model.mask_decoder(
+            image_embeddings=image_embeddings,
+            image_pe=self.visual_model.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+        )
+        if multimask_output:
+            sorted_ids = torch.argsort(iou_predictions, dim=-1, descending=True)
+            low_res_masks = torch.take_along_dim(low_res_masks, sorted_ids[..., None, None], dim=1)[:, :1]
+        pred_mask = self.visual_model.postprocess_masks(
+            low_res_masks,
+            input_size=resize_list[0],
+            original_size=original_size_list[0],
+        )
+        return pred_mask[:, 0]
+AutoConfig.register("evf", EvfConfig)
+AutoModelForCausalLM.register(EvfConfig, EvfSamModel)

model/segment_anything/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from .automatic_mask_generator import SamAutomaticMaskGenerator
+from .build_sam import (build_sam, build_sam_vit_b, build_sam_vit_h,
+                        build_sam_vit_l, sam_model_registry)
+from .predictor import SamPredictor

model/segment_anything/automatic_mask_generator.py ADDED Viewed

	@@ -0,0 +1,372 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area  # type: ignore
+from .modeling import Sam
+from .predictor import SamPredictor
+from .utils.amg import (MaskData, area_from_rle, batch_iterator,
+                        batched_mask_to_box, box_xyxy_to_xywh,
+                        build_all_layer_point_grids, calculate_stability_score,
+                        coco_encode_rle, generate_crop_boxes,
+                        is_box_near_crop_edge, mask_to_rle_pytorch,
+                        remove_small_regions, rle_to_mask, uncrop_boxes_xyxy,
+                        uncrop_masks, uncrop_points)
+class SamAutomaticMaskGenerator:
+    def __init__(
+        self,
+        model: Sam,
+        points_per_side: Optional[int] = 32,
+        points_per_batch: int = 64,
+        pred_iou_thresh: float = 0.88,
+        stability_score_thresh: float = 0.95,
+        stability_score_offset: float = 1.0,
+        box_nms_thresh: float = 0.7,
+        crop_n_layers: int = 0,
+        crop_nms_thresh: float = 0.7,
+        crop_overlap_ratio: float = 512 / 1500,
+        crop_n_points_downscale_factor: int = 1,
+        point_grids: Optional[List[np.ndarray]] = None,
+        min_mask_region_area: int = 0,
+        output_mode: str = "binary_mask",
+    ) -> None:
+        """
+        Set up whole-image mask generation with a SAM model.
+        A grid of point prompts is laid over the image (and optionally over
+        crops of it); the resulting masks are then filtered for quality and
+        de-duplicated. The defaults are tuned for SAM with a ViT-H backbone.
+        Arguments:
+          model (Sam): The SAM model used for mask prediction.
+          points_per_side (int or None): Number of points sampled along one
+            side of the image, for points_per_side**2 points in total. If
+            None, 'point_grids' must supply the sampling points explicitly.
+          points_per_batch (int): Number of points the model processes at
+            once. Larger values may be faster but need more GPU memory.
+          pred_iou_thresh (float): Filtering threshold in [0,1] applied to
+            the model's own predicted mask quality.
+          stability_score_thresh (float): Filtering threshold in [0,1]
+            applied to the mask's stability under changes of the cutoff used
+            to binarize the model's mask predictions.
+          stability_score_offset (float): Amount by which the cutoff is
+            shifted when calculating the stability score.
+          box_nms_thresh (float): Box IoU cutoff used by non-maximal
+            suppression to drop duplicate masks.
+          crop_n_layers (int): If >0, prediction is repeated on crops of the
+            image. Sets the number of crop layers; layer i_layer uses
+            2**i_layer image crops.
+          crop_nms_thresh (float): Box IoU cutoff used by non-maximal
+            suppression to drop duplicate masks coming from different crops.
+          crop_overlap_ratio (float): Degree of overlap between crops. In the
+            first crop layer, crops overlap by this fraction of the image
+            length; later layers with more crops scale the overlap down.
+          crop_n_points_downscale_factor (int): The points-per-side sampled
+            in layer n is divided by crop_n_points_downscale_factor**n.
+          point_grids (list(np.ndarray) or None): Explicit point grids,
+            normalized to [0,1]; the nth grid is used in the nth crop layer.
+            Mutually exclusive with points_per_side.
+          min_mask_region_area (int): If >0, postprocessing removes
+            disconnected regions and holes with area smaller than this
+            value. Requires opencv.
+          output_mode (str): Format of the returned masks: 'binary_mask',
+            'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires
+            pycocotools. At large resolutions 'binary_mask' may consume a
+            lot of memory.
+        """
+        side_given = points_per_side is not None
+        grids_given = point_grids is not None
+        assert (
+            side_given != grids_given
+        ), "Exactly one of points_per_side or point_grid must be provided."
+        if side_given:
+            # Derive one grid per crop layer from the per-side point count.
+            self.point_grids = build_all_layer_point_grids(
+                points_per_side, crop_n_layers, crop_n_points_downscale_factor
+            )
+        elif grids_given:
+            self.point_grids = point_grids
+        else:
+            # Only reachable when the assertion above is disabled.
+            raise ValueError("Can't have both points_per_side and point_grid be None.")
+        assert output_mode in (
+            "binary_mask",
+            "uncompressed_rle",
+            "coco_rle",
+        ), f"Unknown output_mode {output_mode}."
+        # Import optional dependencies now so a missing package fails at
+        # construction time rather than in the middle of generate().
+        if output_mode == "coco_rle":
+            from pycocotools import mask as mask_utils  # type: ignore # noqa: F401
+        if min_mask_region_area > 0:
+            import cv2  # type: ignore # noqa: F401
+        self.predictor = SamPredictor(model)
+        # Sampling and filtering configuration.
+        self.points_per_batch = points_per_batch
+        self.pred_iou_thresh = pred_iou_thresh
+        self.stability_score_thresh = stability_score_thresh
+        self.stability_score_offset = stability_score_offset
+        self.box_nms_thresh = box_nms_thresh
+        # Cropping configuration.
+        self.crop_n_layers = crop_n_layers
+        self.crop_nms_thresh = crop_nms_thresh
+        self.crop_overlap_ratio = crop_overlap_ratio
+        self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+        # Postprocessing and output configuration.
+        self.min_mask_region_area = min_mask_region_area
+        self.output_mode = output_mode
+    @torch.no_grad()
+    def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """
+        Produce mask records for one image.
+        Arguments:
+          image (np.ndarray): The image to segment, in HWC uint8 format.
+        Returns:
+           list(dict(str, any)): One record per mask, with the keys:
+               segmentation (dict(str, any) or np.ndarray): The mask. An
+                 array of shape HW when output_mode='binary_mask',
+                 otherwise a dictionary holding the RLE.
+               bbox (list(float)): Box around the mask, in XYWH format.
+               area (int): Mask area in pixels.
+               predicted_iou (float): The model's own estimate of the
+                 mask's quality (filtered by pred_iou_thresh).
+               point_coords (list(list(float))): The point prompt that
+                 produced this mask.
+               stability_score (float): A mask quality measure (filtered
+                 by stability_score_thresh).
+               crop_box (list(float)): The image crop the mask was
+                 generated from, in XYWH format.
+        """
+        mask_data = self._generate_masks(image)
+        # Optionally clean up small islands and holes, then re-run NMS with
+        # the looser of the two NMS thresholds.
+        if self.min_mask_region_area > 0:
+            nms_thresh = max(self.box_nms_thresh, self.crop_nms_thresh)
+            mask_data = self.postprocess_small_regions(
+                mask_data, self.min_mask_region_area, nms_thresh
+            )
+        # Convert the stored RLEs into the requested output representation.
+        mode = self.output_mode
+        if mode == "coco_rle":
+            mask_data["segmentations"] = [
+                coco_encode_rle(rle) for rle in mask_data["rles"]
+            ]
+        elif mode == "binary_mask":
+            mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+        else:
+            mask_data["segmentations"] = mask_data["rles"]
+        # Assemble one plain-Python record per mask.
+        records = []
+        for idx, segmentation in enumerate(mask_data["segmentations"]):
+            records.append(
+                {
+                    "segmentation": segmentation,
+                    "area": area_from_rle(mask_data["rles"][idx]),
+                    "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+                    "predicted_iou": mask_data["iou_preds"][idx].item(),
+                    "point_coords": [mask_data["points"][idx].tolist()],
+                    "stability_score": mask_data["stability_score"][idx].item(),
+                    "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+                }
+            )
+        return records
+    def _generate_masks(self, image: np.ndarray) -> MaskData:
+        """
+        Run mask prediction over every crop of the image and merge the results.
+        Crop boxes come from generate_crop_boxes; each crop is handled by
+        _process_crop. When more than one crop exists, a second NMS pass
+        removes duplicates across crops. The merged data is converted with
+        MaskData.to_numpy() before it is returned.
+        """
+        orig_size = image.shape[:2]
+        crop_boxes, layer_idxs = generate_crop_boxes(
+            orig_size, self.crop_n_layers, self.crop_overlap_ratio
+        )
+        merged = MaskData()
+        for box, layer in zip(crop_boxes, layer_idxs):
+            merged.cat(self._process_crop(image, box, layer, orig_size))
+        if len(crop_boxes) > 1:
+            # Score by inverse crop area so NMS prefers masks from smaller crops.
+            crop_priority = 1 / box_area(merged["crop_boxes"])
+            crop_priority = crop_priority.to(merged["boxes"].device)
+            kept = batched_nms(
+                merged["boxes"].float(),
+                crop_priority,
+                torch.zeros_like(merged["boxes"][:, 0]),  # single category
+                iou_threshold=self.crop_nms_thresh,
+            )
+            merged.filter(kept)
+        merged.to_numpy()
+        return merged
+    def _process_crop(
+        self,
+        image: np.ndarray,
+        crop_box: List[int],
+        crop_layer_idx: int,
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """
+        Run point-prompted mask prediction on a single crop of the image.
+        Arguments:
+          image (np.ndarray): The full image, indexed as image[y, x, channel].
+          crop_box (list(int)): The crop as [x0, y0, x1, y1] in image pixels.
+          crop_layer_idx (int): Index into self.point_grids selecting the
+            point grid used for this crop.
+          orig_size (tuple(int, ...)): Size of the full image, forwarded to
+            _process_batch.
+        Returns:
+          MaskData: The masks kept after in-crop NMS. 'boxes' and 'points'
+            have been passed through uncrop_boxes_xyxy / uncrop_points, and
+            'crop_boxes' holds one copy of crop_box per mask with shape
+            (num_masks, 4), including when num_masks is 0.
+        """
+        # Crop the image and calculate embeddings
+        x0, y0, x1, y1 = crop_box
+        cropped_im = image[y0:y1, x0:x1, :]
+        cropped_im_size = cropped_im.shape[:2]
+        self.predictor.set_image(cropped_im)
+        # Get points for this crop: the grid is scaled by (width, height),
+        # i.e. the reversed (height, width) crop size.
+        points_scale = np.array(cropped_im_size)[None, ::-1]
+        points_for_image = self.point_grids[crop_layer_idx] * points_scale
+        # Generate masks for this crop in batches
+        data = MaskData()
+        for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+            batch_data = self._process_batch(
+                points, cropped_im_size, crop_box, orig_size
+            )
+            data.cat(batch_data)
+            del batch_data
+        self.predictor.reset_image()
+        # Remove duplicates within this crop.
+        keep_by_nms = batched_nms(
+            data["boxes"].float(),
+            data["iou_preds"],
+            torch.zeros_like(data["boxes"][:, 0]),  # categories
+            iou_threshold=self.box_nms_thresh,
+        )
+        data.filter(keep_by_nms)
+        # Return to the original image frame
+        data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+        data["points"] = uncrop_points(data["points"], crop_box)
+        # Tile the crop box once per surviving mask. Building the tensor from
+        # an empty Python list would give shape (0,) rather than (0, 4),
+        # which breaks the box_area() call on 'crop_boxes' in the cross-crop
+        # NMS when no mask survives; repeat() keeps the tensor 2-D.
+        num_masks = len(data["rles"])
+        data["crop_boxes"] = torch.tensor(crop_box).reshape(1, -1).repeat(num_masks, 1)
+        return data
+    def _process_batch(
+        self,
+        points: np.ndarray,
+        im_size: Tuple[int, ...],
+        crop_box: List[int],
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """
+        Predict masks for one batch of point prompts and filter them.
+        Arguments:
+          points (np.ndarray): Point prompts for this batch, in the
+            coordinates of the current crop.
+          im_size (tuple(int, ...)): Size of the cropped image, passed to the
+            predictor's coordinate transform.
+          crop_box (list(int)): The crop in the original image, used for the
+            edge filter and for un-cropping the masks.
+          orig_size (tuple(int, ...)): (height, width) of the original image.
+        Returns:
+          MaskData: Holds 'iou_preds', 'points', 'stability_score', 'boxes'
+            and 'rles' for the masks that pass all filters. The dense
+            'masks' entry is deleted before returning.
+        """
+        orig_h, orig_w = orig_size
+        # Run model on this batch
+        transformed_points = self.predictor.transform.apply_coords(points, im_size)
+        in_points = torch.as_tensor(transformed_points, device=self.predictor.device)
+        # Every prompt gets label 1.
+        in_labels = torch.ones(
+            in_points.shape[0], dtype=torch.int, device=in_points.device
+        )
+        # One prompt per batch row; logits are requested so the stability
+        # score can be computed at shifted thresholds below.
+        masks, iou_preds, _ = self.predictor.predict_torch(
+            in_points[:, None, :],
+            in_labels[:, None],
+            multimask_output=True,
+            return_logits=True,
+        )
+        # Serialize predictions and store in MaskData. The first two mask
+        # dimensions are flattened together, and each point row is repeated
+        # masks.shape[1] times so points stay aligned with the flattened masks.
+        data = MaskData(
+            masks=masks.flatten(0, 1),
+            iou_preds=iou_preds.flatten(0, 1),
+            points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
+        )
+        del masks
+        # Filter by predicted IoU
+        if self.pred_iou_thresh > 0.0:
+            keep_mask = data["iou_preds"] > self.pred_iou_thresh
+            data.filter(keep_mask)
+        # Calculate stability score
+        data["stability_score"] = calculate_stability_score(
+            data["masks"],
+            self.predictor.model.mask_threshold,
+            self.stability_score_offset,
+        )
+        if self.stability_score_thresh > 0.0:
+            keep_mask = data["stability_score"] >= self.stability_score_thresh
+            data.filter(keep_mask)
+        # Threshold masks and calculate boxes
+        data["masks"] = data["masks"] > self.predictor.model.mask_threshold
+        data["boxes"] = batched_mask_to_box(data["masks"])
+        # Filter boxes that touch crop boundaries
+        keep_mask = ~is_box_near_crop_edge(
+            data["boxes"], crop_box, [0, 0, orig_w, orig_h]
+        )
+        # Skip the filter call entirely when nothing would be removed.
+        if not torch.all(keep_mask):
+            data.filter(keep_mask)
+        # Compress to RLE
+        data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+        data["rles"] = mask_to_rle_pytorch(data["masks"])
+        # Drop the dense masks; only the RLE encoding is kept from here on.
+        del data["masks"]
+        return data
+    @staticmethod
+    def postprocess_small_regions(
+        mask_data: MaskData, min_area: int, nms_thresh: float
+    ) -> MaskData:
+        """
+        Removes small disconnected regions and holes in masks, then reruns
+        box NMS to remove any new duplicates.
+        Edits mask_data in place.
+        Requires open-cv as a dependency.
+        Arguments:
+          mask_data (MaskData): Mask records; the 'rles' and 'boxes' entries
+            are read and updated.
+          min_area (int): Area threshold passed to remove_small_regions for
+            both holes and islands.
+          nms_thresh (float): Box IoU threshold for the NMS rerun.
+        Returns:
+          MaskData: The same mask_data object, filtered to the masks kept by
+            NMS. Returned unchanged when it contains no RLEs.
+        """
+        if len(mask_data["rles"]) == 0:
+            return mask_data
+        # Filter small disconnected regions and holes
+        new_masks = []
+        scores = []
+        for rle in mask_data["rles"]:
+            mask = rle_to_mask(rle)
+            mask, changed = remove_small_regions(mask, min_area, mode="holes")
+            unchanged = not changed
+            mask, changed = remove_small_regions(mask, min_area, mode="islands")
+            # A mask counts as unchanged only if neither pass modified it.
+            unchanged = unchanged and not changed
+            new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+            # Give score=0 to changed masks and score=1 to unchanged masks
+            # so NMS will prefer ones that didn't need postprocessing
+            scores.append(float(unchanged))
+        # Recalculate boxes and remove any new duplicates
+        masks = torch.cat(new_masks, dim=0)
+        boxes = batched_mask_to_box(masks)
+        keep_by_nms = batched_nms(
+            boxes.float(),
+            torch.as_tensor(scores),
+            torch.zeros_like(boxes[:, 0]),  # categories
+            iou_threshold=nms_thresh,
+        )
+        # Only recalculate RLEs for masks that have changed
+        for i_mask in keep_by_nms:
+            # i_mask is a tensor element taken from keep_by_nms and is used
+            # directly as an index into the Python list and the tensors.
+            if scores[i_mask] == 0.0:
+                mask_torch = masks[i_mask].unsqueeze(0)
+                mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+                mask_data["boxes"][i_mask] = boxes[i_mask]  # update res directly
+        mask_data.filter(keep_by_nms)
+        return mask_data

model/segment_anything/build_sam.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from functools import partial
+import torch
+from .modeling import (ImageEncoderViT, MaskDecoder, PromptEncoder, Sam,
+                       TwoWayTransformer)
+def build_sam_vit_h(checkpoint=None):
+    """Build the SAM variant with embed dim 1280, depth 32 and 16 heads.
+    `checkpoint` is forwarded to _build_sam unchanged."""
+    encoder_cfg = dict(
+        encoder_embed_dim=1280,
+        encoder_depth=32,
+        encoder_num_heads=16,
+        encoder_global_attn_indexes=[7, 15, 23, 31],
+    )
+    return _build_sam(checkpoint=checkpoint, **encoder_cfg)
+# Default builder: an alias for the ViT-H variant.
+build_sam = build_sam_vit_h
+def build_sam_vit_l(checkpoint=None):
+    """Build the SAM variant with embed dim 1024, depth 24 and 16 heads.
+    `checkpoint` is forwarded to _build_sam unchanged."""
+    encoder_cfg = dict(
+        encoder_embed_dim=1024,
+        encoder_depth=24,
+        encoder_num_heads=16,
+        encoder_global_attn_indexes=[5, 11, 17, 23],
+    )
+    return _build_sam(checkpoint=checkpoint, **encoder_cfg)
+def build_sam_vit_b(checkpoint=None):
+    """Build the SAM variant with embed dim 768, depth 12 and 12 heads.
+    `checkpoint` is forwarded to _build_sam unchanged."""
+    encoder_cfg = dict(
+        encoder_embed_dim=768,
+        encoder_depth=12,
+        encoder_num_heads=12,
+        encoder_global_attn_indexes=[2, 5, 8, 11],
+    )
+    return _build_sam(checkpoint=checkpoint, **encoder_cfg)
+# Maps a model-type name to its builder function; "default" resolves to the
+# same builder as "vit_h".
+sam_model_registry = {
+    "default": build_sam_vit_h,
+    "vit_h": build_sam_vit_h,
+    "vit_l": build_sam_vit_l,
+    "vit_b": build_sam_vit_b,
+}
+def _build_sam(
+    encoder_embed_dim,
+    encoder_depth,
+    encoder_num_heads,
+    encoder_global_attn_indexes,
+    checkpoint=None,
+):
+    """
+    Assemble a Sam model from an image encoder, prompt encoder and mask decoder.
+    Args:
+        encoder_embed_dim: Embedding dimension of the ViT image encoder.
+        encoder_depth: Number of transformer blocks in the image encoder.
+        encoder_num_heads: Number of attention heads per encoder block.
+        encoder_global_attn_indexes: Indexes of the encoder blocks that use
+            global attention; the remaining blocks use window size 14.
+        checkpoint: Optional path to a state-dict file. When given, it is
+            loaded non-strictly, so missing or unexpected keys do not raise.
+    Returns:
+        The Sam model in eval mode.
+    """
+    prompt_embed_dim = 256
+    image_size = 1024
+    vit_patch_size = 16
+    # Side length of the image embedding grid: one cell per ViT patch.
+    image_embedding_size = image_size // vit_patch_size
+    sam = Sam(
+        image_encoder=ImageEncoderViT(
+            depth=encoder_depth,
+            embed_dim=encoder_embed_dim,
+            img_size=image_size,
+            mlp_ratio=4,
+            norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
+            num_heads=encoder_num_heads,
+            patch_size=vit_patch_size,
+            qkv_bias=True,
+            use_rel_pos=True,
+            global_attn_indexes=encoder_global_attn_indexes,
+            window_size=14,
+            out_chans=prompt_embed_dim,
+        ),
+        prompt_encoder=PromptEncoder(
+            embed_dim=prompt_embed_dim,
+            image_embedding_size=(image_embedding_size, image_embedding_size),
+            input_image_size=(image_size, image_size),
+            mask_in_chans=16,
+        ),
+        mask_decoder=MaskDecoder(
+            num_multimask_outputs=3,
+            transformer=TwoWayTransformer(
+                depth=2,
+                embedding_dim=prompt_embed_dim,
+                mlp_dim=2048,
+                num_heads=8,
+            ),
+            transformer_dim=prompt_embed_dim,
+            iou_head_depth=3,
+            iou_head_hidden_dim=256,
+        ),
+        pixel_mean=[123.675, 116.28, 103.53],
+        pixel_std=[58.395, 57.12, 57.375],
+    )
+    sam.eval()
+    if checkpoint is not None:
+        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
+        # map_location="cpu" lets a checkpoint saved from a GPU be read on a
+        # CPU-only host and avoids staging the weights on the GPU first;
+        # load_state_dict copies the values into the model's own parameters.
+        with open(checkpoint, "rb") as f:
+            state_dict = torch.load(f, map_location="cpu")
+        sam.load_state_dict(state_dict, strict=False)
+    return sam

model/segment_anything/modeling/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from .image_encoder import ImageEncoderViT
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+from .sam import Sam
+from .transformer import TwoWayTransformer

model/segment_anything/modeling/common.py ADDED Viewed

	@@ -0,0 +1,43 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Type
+import torch
+import torch.nn as nn
+class MLPBlock(nn.Module):
+    """Two-layer feed-forward block: Linear -> activation -> Linear."""
+    def __init__(
+        self,
+        embedding_dim: int,
+        mlp_dim: int,
+        act: Type[nn.Module] = nn.GELU,
+    ) -> None:
+        """
+        Args:
+            embedding_dim (int): Input and output feature size.
+            mlp_dim (int): Hidden feature size between the two linear layers.
+            act (nn.Module): Activation class, instantiated with no arguments.
+        """
+        super().__init__()
+        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
+        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
+        self.act = act()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        hidden = self.lin1(x)
+        hidden = self.act(hidden)
+        return self.lin2(hidden)
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119  # noqa
+class LayerNorm2d(nn.Module):
+    """Normalizes over dimension 1 with a learned per-channel scale and bias."""
+    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Mean and (biased) variance are taken over dimension 1.
+        mean = x.mean(1, keepdim=True)
+        centered = x - mean
+        variance = centered.pow(2).mean(1, keepdim=True)
+        normalized = centered / torch.sqrt(variance + self.eps)
+        # Add two trailing axes so the per-channel parameters broadcast.
+        scale = self.weight[:, None, None]
+        shift = self.bias[:, None, None]
+        return scale * normalized + shift

model/segment_anything/modeling/image_encoder.py ADDED Viewed

	@@ -0,0 +1,426 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Optional, Tuple, Type
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .common import LayerNorm2d, MLPBlock
+# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
+class ImageEncoderViT(nn.Module):
+    """
+    ViT image encoder: patch embedding, optional absolute positional
+    embedding, a stack of transformer blocks, and a convolutional neck that
+    projects the features to out_chans channels.
+    """
+    def __init__(
+        self,
+        img_size: int = 1024,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        out_chans: int = 256,
+        qkv_bias: bool = True,
+        norm_layer: Type[nn.Module] = nn.LayerNorm,
+        act_layer: Type[nn.Module] = nn.GELU,
+        use_abs_pos: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        global_attn_indexes: Tuple[int, ...] = (),
+    ) -> None:
+        """
+        Args:
+            img_size (int): Input image size.
+            patch_size (int): Patch size.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+            depth (int): Depth of ViT.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            out_chans (int): Number of output channels produced by the neck.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_abs_pos (bool): If True, use absolute positional embeddings.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks.
+            global_attn_indexes (list): Indexes for blocks using global attention.
+        """
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.out_chans = out_chans
+        # Kernel size equals stride, so patches do not overlap.
+        self.patch_embed = PatchEmbed(
+            kernel_size=(patch_size, patch_size),
+            stride=(patch_size, patch_size),
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+        self.pos_embed: Optional[nn.Parameter] = None
+        if use_abs_pos:
+            # Initialize absolute positional embedding with pretrain image size.
+            self.pos_embed = nn.Parameter(
+                torch.zeros(
+                    1, img_size // patch_size, img_size // patch_size, embed_dim
+                )
+            )
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            block = Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                # Blocks listed in global_attn_indexes get window_size 0,
+                # which Block treats as global attention.
+                window_size=window_size if i not in global_attn_indexes else 0,
+                input_size=(img_size // patch_size, img_size // patch_size),
+            )
+            self.blocks.append(block)
+        # Neck: 1x1 conv to out_chans, then a 3x3 conv, each followed by
+        # LayerNorm2d.
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                embed_dim,
+                out_chans,
+                kernel_size=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+            nn.Conv2d(
+                out_chans,
+                out_chans,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode an image batch and return the neck output."""
+        x = self.patch_embed(x)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+        for blk in self.blocks:
+            x = blk(x)
+        dtype = x.dtype
+        # The blocks work on channels-last tensors; the permute moves the
+        # last dimension to position 1 for the convolutional neck.
+        if dtype == torch.float16:  # prevent overflow
+            # NOTE(review): device_type is hard-coded to "cuda", and whether
+            # autocast accepts float32 as a CUDA target dtype (rather than
+            # warning and disabling itself) depends on the PyTorch version;
+            # confirm this actually runs the neck in float32.
+            with torch.autocast(device_type="cuda", dtype=torch.float32):
+                x = self.neck(x.permute(0, 3, 1, 2))
+            x = x.to(dtype)
+        else:
+            x = self.neck(x.permute(0, 3, 1, 2))
+        return x
+class Block(nn.Module):
+    """Transformer block with optional window attention and residual connections."""
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        norm_layer: Type[nn.Module] = nn.LayerNorm,
+        act_layer: Type[nn.Module] = nn.GELU,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        input_size: Optional[Tuple[int, int]] = None,
+    ) -> None:
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads in this block.
+            mlp_ratio (float): Ratio of the MLP hidden dim to the embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            norm_layer (nn.Module): Normalization layer class.
+            act_layer (nn.Module): Activation layer class.
+            use_rel_pos (bool): If True, add relative positional embeddings
+                to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize the relative
+                positional parameters.
+            window_size (int): Window size for window attention. A value of
+                0 selects global attention.
+            input_size (tuple(int, int) or None): Input resolution used to
+                size the relative positional parameters.
+        """
+        super().__init__()
+        # With window attention, the attention layer only ever sees
+        # window-sized inputs, so its positional tables are sized for that.
+        if window_size == 0:
+            attn_input_size = input_size
+        else:
+            attn_input_size = (window_size, window_size)
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            input_size=attn_input_size,
+        )
+        self.norm2 = norm_layer(dim)
+        self.mlp = MLPBlock(
+            embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer
+        )
+        self.window_size = window_size
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = x
+        x = self.norm1(x)
+        windowed = self.window_size > 0
+        if windowed:
+            # Remember the unpadded size so the padding can be cut off again.
+            hw = (x.shape[1], x.shape[2])
+            x, pad_hw = window_partition(x, self.window_size)
+        x = self.attn(x)
+        if windowed:
+            x = window_unpartition(x, self.window_size, pad_hw, hw)
+        x = residual + x
+        return x + self.mlp(self.norm2(x))
+class Attention(nn.Module):
+    """Multi-head Attention block with relative position embeddings."""
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        input_size: Optional[Tuple[int, int]] = None,
+    ) -> None:
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads.
+            qkv_bias (bool):  If True, add a learnable bias to query, key, value.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+                Not read in this constructor; the tables below are always
+                created with torch.zeros.
+            input_size (tuple(int, int) or None): Input resolution for calculating the relative
+                positional parameter size.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # Scaling factor 1/sqrt(head_dim) applied to the queries.
+        self.scale = head_dim**-0.5
+        # A single linear layer produces q, k and v concatenated.
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            assert (
+                input_size is not None
+            ), "Input size must be provided if using relative positional encoding."
+            # initialize relative positional embeddings
+            # One row per possible relative offset: 2 * size - 1 per axis.
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply multi-head self-attention to a (B, H, W, C) tensor; the output has the same layout."""
+        B, H, W, _ = x.shape
+        # qkv with shape (3, B, nHead, H * W, C)
+        qkv = (
+            self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        )
+        # q, k, v with shape (B * nHead, H * W, C)
+        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
+        attn = (q * self.scale) @ k.transpose(-2, -1)
+        if self.use_rel_pos:
+            # Query and key grids have the same (H, W) size here.
+            attn = add_decomposed_rel_pos(
+                attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)
+            )
+        attn = attn.softmax(dim=-1)
+        # Split the heads back out, move them next to the channel axis and
+        # merge them into the last dimension.
+        x = (
+            (attn @ v)
+            .view(B, self.num_heads, H, W, -1)
+            .permute(0, 2, 3, 1, 4)
+            .reshape(B, H, W, -1)
+        )
+        x = self.proj(x)
+        return x
+def window_partition(
+    x: torch.Tensor, window_size: int
+) -> Tuple[torch.Tensor, Tuple[int, int]]:
+    """
+    Split a token grid into non-overlapping square windows, padding the
+    bottom and right edges when the grid size is not a multiple of the
+    window size.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+    """
+    B, H, W, C = x.shape
+    # Amount needed to reach the next multiple of window_size (0 if aligned).
+    pad_h = -H % window_size
+    pad_w = -W % window_size
+    if pad_h or pad_w:
+        # F.pad pairs run from the last dimension backwards: C, W, H.
+        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+    Hp = H + pad_h
+    Wp = W + pad_w
+    grid = x.view(
+        B, Hp // window_size, window_size, Wp // window_size, window_size, C
+    )
+    # Bring the two window-index axes together, then fold them into the batch.
+    windows = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
+    windows = windows.view(-1, window_size, window_size, C)
+    return windows, (Hp, Wp)
+def window_unpartition(
+    windows: torch.Tensor,
+    window_size: int,
+    pad_hw: Tuple[int, int],
+    hw: Tuple[int, int],
+) -> torch.Tensor:
+    """
+    Reassemble windows into the original token grid and strip any padding.
+    Args:
+        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        window_size (int): window size.
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        hw (Tuple): original height and width (H, W) before padding.
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """
+    Hp, Wp = pad_hw
+    H, W = hw
+    windows_per_image = Hp * Wp // window_size // window_size
+    B = windows.shape[0] // windows_per_image
+    grid = windows.view(
+        B, Hp // window_size, Wp // window_size, window_size, window_size, -1
+    )
+    # Interleave window rows/columns with in-window rows/columns again.
+    x = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+    if H < Hp or W < Wp:
+        # Drop the padding added by window_partition.
+        x = x[:, :H, :W, :].contiguous()
+    return x
+def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+    """
+    Look up the relative positional embedding for every (query, key) index
+        pair along one axis, resizing the embedding table first when its
+        length does not match the sizes involved.
+    Args:
+        q_size (int): size of query q.
+        k_size (int): size of key k.
+        rel_pos (Tensor): relative position embeddings (L, C).
+    Returns:
+        Embeddings gathered per relative position, indexed as (q, k, C).
+    """
+    table_len = int(2 * max(q_size, k_size) - 1)
+    if rel_pos.shape[0] == table_len:
+        table = rel_pos
+    else:
+        # F.interpolate resizes the last axis, so move the length axis last,
+        # resize linearly, then restore the (L, C) layout.
+        channels_first = rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1)
+        resized = F.interpolate(channels_first, size=table_len, mode="linear")
+        table = resized.reshape(-1, table_len).permute(1, 0)
+    # When q and k differ in size, stretch the coordinates of the shorter one.
+    q_scale = max(k_size / q_size, 1.0)
+    k_scale = max(q_size / k_size, 1.0)
+    q_coords = torch.arange(q_size)[:, None] * q_scale
+    k_coords = torch.arange(k_size)[None, :] * k_scale
+    # Offset so the most negative relative distance lands on index 0.
+    relative_coords = (q_coords - k_coords) + (k_size - 1) * k_scale
+    return table[relative_coords.long()]
+def add_decomposed_rel_pos(
+    attn: torch.Tensor,
+    q: torch.Tensor,
+    rel_pos_h: torch.Tensor,
+    rel_pos_w: torch.Tensor,
+    q_size: Tuple[int, int],
+    k_size: Tuple[int, int],
+) -> torch.Tensor:
+    """
+    Add decomposed Relative Positional Embeddings from :paper:`mvitv2` to an
+    attention map, one bias term per spatial axis.
+    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
+    Args:
+        attn (Tensor): attention map.
+        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+    Returns:
+        attn (Tensor): attention map with added relative positional embeddings,
+            shaped (B, q_h * q_w, k_h * k_w).
+    """
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    height_table = get_rel_pos(q_h, k_h, rel_pos_h)
+    width_table = get_rel_pos(q_w, k_w, rel_pos_w)
+    batch, _, dim = q.shape
+    q_grid = q.reshape(batch, q_h, q_w, dim)
+    # Contract the channel axis of each query against the per-axis tables.
+    height_bias = torch.einsum("bhwc,hkc->bhwk", q_grid, height_table)
+    width_bias = torch.einsum("bhwc,wkc->bhwk", q_grid, width_table)
+    # Broadcast the height bias over key columns and the width bias over key rows.
+    attn_grid = attn.view(batch, q_h, q_w, k_h, k_w)
+    attn_grid = (
+        attn_grid + height_bias[:, :, :, :, None] + width_bias[:, :, :, None, :]
+    )
+    return attn_grid.view(batch, q_h * q_w, k_h * k_w)
+class PatchEmbed(nn.Module):
+    """
+    Image to Patch Embedding, implemented as a single 2D convolution.
+    """
+    def __init__(
+        self,
+        kernel_size: Tuple[int, int] = (16, 16),
+        stride: Tuple[int, int] = (16, 16),
+        padding: Tuple[int, int] = (0, 0),
+        in_chans: int = 3,
+        embed_dim: int = 768,
+    ) -> None:
+        """
+        Args:
+            kernel_size (Tuple): kernel size of the projection layer.
+            stride (Tuple): stride of the projection layer.
+            padding (Tuple): padding size of the projection layer.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+        """
+        super().__init__()
+        self.proj = nn.Conv2d(
+            in_channels=in_chans,
+            out_channels=embed_dim,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project the image and return it channels-last (B C H W -> B H W C)."""
+        projected = self.proj(x)
+        return projected.permute(0, 2, 3, 1)

model/segment_anything/modeling/mask_decoder.py ADDED Viewed

	@@ -0,0 +1,191 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List, Tuple, Type
+import torch
+from torch import nn
+from torch.nn import functional as F
+from .common import LayerNorm2d
+class MaskDecoder(nn.Module):
+    def __init__(
+        self,
+        *,
+        transformer_dim: int,
+        transformer: nn.Module,
+        num_multimask_outputs: int = 3,
+        activation: Type[nn.Module] = nn.GELU,
+        iou_head_depth: int = 3,
+        iou_head_hidden_dim: int = 256,
+    ) -> None:
+        """
+        Predicts masks given an image and prompt embeddings, using a
+        transformer architecture.
+        Arguments:
+          transformer_dim (int): the channel dimension of the transformer
+          transformer (nn.Module): the transformer used to predict masks
+          num_multimask_outputs (int): the number of masks to predict
+            when disambiguating masks
+          activation (nn.Module): the type of activation to use when
+            upscaling masks
+          iou_head_depth (int): the depth of the MLP used to predict
+            mask quality
+          iou_head_hidden_dim (int): the hidden dimension of the MLP
+            used to predict mask quality
+        """
+        super().__init__()
+        self.transformer_dim = transformer_dim
+        self.transformer = transformer
+        self.num_multimask_outputs = num_multimask_outputs
+        self.iou_token = nn.Embedding(1, transformer_dim)
+        # One token more than the multimask count: token 0 is the one returned
+        # when a single mask is requested (see the slicing in forward).
+        self.num_mask_tokens = num_multimask_outputs + 1
+        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
+        quarter_dim = transformer_dim // 4
+        eighth_dim = transformer_dim // 8
+        # Two stride-2 transposed convolutions: 4x spatial upscaling in total.
+        self.output_upscaling = nn.Sequential(
+            nn.ConvTranspose2d(transformer_dim, quarter_dim, kernel_size=2, stride=2),
+            LayerNorm2d(quarter_dim),
+            activation(),
+            nn.ConvTranspose2d(quarter_dim, eighth_dim, kernel_size=2, stride=2),
+            activation(),
+        )
+        # One hypernetwork MLP per mask token.
+        hypernetwork_mlps = []
+        for _ in range(self.num_mask_tokens):
+            hypernetwork_mlps.append(
+                MLP(transformer_dim, transformer_dim, eighth_dim, 3)
+            )
+        self.output_hypernetworks_mlps = nn.ModuleList(hypernetwork_mlps)
+        self.iou_prediction_head = MLP(
+            transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth
+        )
+    def forward(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+        dense_prompt_embeddings: torch.Tensor,
+        multimask_output: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Predict masks given image and prompt embeddings.
+        Arguments:
+          image_embeddings (torch.Tensor): the embeddings from the image encoder
+          image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
+          sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
+          dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
+          multimask_output (bool): Whether to return multiple masks or a single
+            mask.
+        Returns:
+          torch.Tensor: batched predicted masks
+          torch.Tensor: batched predictions of mask quality
+        """
+        all_masks, all_iou = self.predict_masks(
+            image_embeddings=image_embeddings,
+            image_pe=image_pe,
+            sparse_prompt_embeddings=sparse_prompt_embeddings,
+            dense_prompt_embeddings=dense_prompt_embeddings,
+        )
+        # Index 0 is the single-mask output; indices 1.. are the multimask outputs.
+        mask_slice = slice(1, None) if multimask_output else slice(0, 1)
+        return all_masks[:, mask_slice, :, :], all_iou[:, mask_slice]
+    def predict_masks(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+        dense_prompt_embeddings: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Predicts all masks and quality scores. See 'forward' for more details."""
+        # Learned output tokens: the IoU token first, then the mask tokens,
+        # repeated for every prompt in the batch and followed by the prompt tokens.
+        learned_tokens = torch.cat(
+            [self.iou_token.weight, self.mask_tokens.weight], dim=0
+        )
+        num_prompts = sparse_prompt_embeddings.size(0)
+        learned_tokens = learned_tokens.unsqueeze(0).expand(num_prompts, -1, -1)
+        tokens = torch.cat((learned_tokens, sparse_prompt_embeddings), dim=1)
+        # image_embeddings: [1, C, H, W], tokens: [B, N, C]
+        # dense_prompt_embeddings: [B, C, H, W]
+        # Expand per-image data in batch direction to be per-mask
+        src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
+        src = src + dense_prompt_embeddings
+        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
+        batch, channels, height, width = src.shape
+        # Run the transformer
+        hs, src = self.transformer(src, pos_src, tokens)
+        iou_token_out = hs[:, 0, :]
+        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
+        # The transformer returns the image tokens flattened; restore B x C x H x W.
+        src = src.transpose(1, 2).view(batch, channels, height, width)
+        upscaled = self.output_upscaling(src)
+        # Each mask token is mapped by its own MLP to per-channel weights.
+        hyper_in = torch.stack(
+            [
+                self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])
+                for i in range(self.num_mask_tokens)
+            ],
+            dim=1,
+        )
+        up_b, up_c, up_h, up_w = upscaled.shape
+        # Mask logits are the dot product of those weights with the upscaled embedding.
+        masks = hyper_in @ upscaled.view(up_b, up_c, up_h * up_w)
+        masks = masks.view(up_b, self.num_mask_tokens, up_h, up_w)
+        # Generate mask quality predictions
+        iou_pred = self.iou_prediction_head(iou_token_out)
+        return masks, iou_pred
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
+class MLP(nn.Module):
+    """Stack of linear layers with ReLU between them and an optional final sigmoid."""
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        sigmoid_output: bool = False,
+    ) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        # Layer widths: input -> hidden x (num_layers - 1) -> output.
+        hidden_sizes = [hidden_dim] * (num_layers - 1)
+        in_sizes = [input_dim] + hidden_sizes
+        out_sizes = hidden_sizes + [output_dim]
+        self.layers = nn.ModuleList(
+            [nn.Linear(n_in, n_out) for n_in, n_out in zip(in_sizes, out_sizes)]
+        )
+        self.sigmoid_output = sigmoid_output
+    def forward(self, x):
+        """Apply every layer in order; ReLU follows all but the last one."""
+        last_index = self.num_layers - 1
+        for index, layer in enumerate(self.layers):
+            x = layer(x)
+            if index < last_index:
+                x = F.relu(x)
+        if self.sigmoid_output:
+            x = F.sigmoid(x)
+        return x

model/segment_anything/modeling/prompt_encoder.py ADDED Viewed

	@@ -0,0 +1,238 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Any, Optional, Tuple, Type
+import numpy as np
+import torch
+from torch import nn
+from .common import LayerNorm2d
+class PromptEncoder(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        image_embedding_size: Tuple[int, int],
+        input_image_size: Tuple[int, int],
+        mask_in_chans: int,
+        activation: Type[nn.Module] = nn.GELU,
+    ) -> None:
+        """
+        Encodes prompts for input to SAM's mask decoder.
+        Arguments:
+          embed_dim (int): The prompts' embedding dimension
+          image_embedding_size (tuple(int, int)): The spatial size of the
+            image embedding, as (H, W).
+          input_image_size (tuple(int, int)): The padded size of the image as
+            input to the image encoder, as (H, W).
+          mask_in_chans (int): The number of hidden channels used for
+            encoding input masks.
+          activation (nn.Module): The activation to use when encoding
+            input masks.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.input_image_size = input_image_size
+        self.image_embedding_size = image_embedding_size
+        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
+        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
+        point_embeddings = [
+            nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)
+        ]
+        self.point_embeddings = nn.ModuleList(point_embeddings)
+        self.not_a_point_embed = nn.Embedding(1, embed_dim)
+        # Input masks are expected at 4x the embedding resolution; the two
+        # stride-2 convolutions below bring them down by that factor.
+        self.mask_input_size = (
+            4 * image_embedding_size[0],
+            4 * image_embedding_size[1],
+        )
+        self.mask_downscaling = nn.Sequential(
+            nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
+            LayerNorm2d(mask_in_chans // 4),
+            activation(),
+            nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
+            LayerNorm2d(mask_in_chans),
+            activation(),
+            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
+        )
+        self.no_mask_embed = nn.Embedding(1, embed_dim)
+    def get_dense_pe(self) -> torch.Tensor:
+        """
+        Returns the positional encoding used to encode point prompts,
+        applied to a dense set of points the shape of the image encoding.
+        Returns:
+          torch.Tensor: Positional encoding with shape
+            1x(embed_dim)x(embedding_h)x(embedding_w)
+        """
+        return self.pe_layer(self.image_embedding_size).unsqueeze(0)
+    def _embed_points(
+        self,
+        points: torch.Tensor,
+        labels: torch.Tensor,
+        pad: bool,
+    ) -> torch.Tensor:
+        """
+        Embeds point prompts.
+        When pad is True one extra point with label -1 is appended to every
+        batch entry. Label -1 entries get only the not-a-point embedding;
+        labels 0 and 1 add point_embeddings[0] and point_embeddings[1] to the
+        positional encoding.
+        """
+        points = points + 0.5  # Shift to center of pixel
+        if pad:
+            padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
+            padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
+            points = torch.cat([points, padding_point], dim=1)
+            labels = torch.cat([labels, padding_label], dim=1)
+        point_embedding = self.pe_layer.forward_with_coords(
+            points, self.input_image_size
+        )
+        # Wipe the positional encoding of padding points before adding their embedding.
+        point_embedding[labels == -1] = 0.0
+        point_embedding[labels == -1] += self.not_a_point_embed.weight
+        point_embedding[labels == 0] += self.point_embeddings[0].weight
+        point_embedding[labels == 1] += self.point_embeddings[1].weight
+        return point_embedding
+    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
+        """Embeds box prompts as two corner points with their own type embeddings."""
+        boxes = boxes + 0.5  # Shift to center of pixel
+        coords = boxes.reshape(-1, 2, 2)
+        corner_embedding = self.pe_layer.forward_with_coords(
+            coords, self.input_image_size
+        )
+        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
+        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
+        return corner_embedding
+    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
+        """Embeds mask inputs by running them through the downscaling convolutions."""
+        mask_embedding = self.mask_downscaling(masks)
+        return mask_embedding
+    def _get_batch_size(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+        text_embeds: Optional[torch.Tensor] = None,
+    ) -> int:
+        """
+        Gets the batch size of the output given the batch size of the input prompts.
+        The first prompt that is not None, checked in the order points, boxes,
+        masks, text_embeds, decides; with no prompts at all the batch size is 1.
+        """
+        if points is not None:
+            return points[0].shape[0]
+        elif boxes is not None:
+            return boxes.shape[0]
+        elif masks is not None:
+            return masks.shape[0]
+        elif text_embeds is not None:
+            return text_embeds.shape[0]
+        else:
+            return 1
+    def _get_device(self) -> torch.device:
+        """Returns the device holding this module's point embeddings."""
+        return self.point_embeddings[0].weight.device
+    def forward(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+        text_embeds: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Embeds different types of prompts, returning both sparse and dense
+        embeddings.
+        Arguments:
+          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
+            and labels to embed.
+          boxes (torch.Tensor or none): boxes to embed
+          masks (torch.Tensor or none): masks to embed
+          text_embeds (torch.Tensor or none): already-computed embeddings that
+            are concatenated to the sparse embeddings unchanged, so they must
+            have shape BxNx(embed_dim). Defaults to None so that callers which
+            only pass points, boxes and masks keep working.
+        Returns:
+          torch.Tensor: sparse embeddings for the points, boxes and text, with
+            shape BxNx(embed_dim), where N is determined by the number of input
+            points, boxes and text embeddings.
+          torch.Tensor: dense embeddings for the masks, in the shape
+            Bx(embed_dim)x(embed_H)x(embed_W)
+        """
+        bs = self._get_batch_size(points, boxes, masks, text_embeds)
+        # Start from zero tokens and append whichever sparse prompts were given.
+        sparse_embeddings = torch.empty(
+            (bs, 0, self.embed_dim), device=self._get_device()
+        )
+        if points is not None:
+            coords, labels = points
+            # A padding point is appended only when no box accompanies the points.
+            point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
+            sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
+        if boxes is not None:
+            box_embeddings = self._embed_boxes(boxes)
+            sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
+        if text_embeds is not None:
+            sparse_embeddings = torch.cat([sparse_embeddings, text_embeds], dim=1)
+        if masks is not None:
+            dense_embeddings = self._embed_masks(masks)
+        else:
+            # Without a mask, broadcast the learned no-mask embedding over the grid.
+            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
+                bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
+            )
+        return sparse_embeddings, dense_embeddings
+class PositionEmbeddingRandom(nn.Module):
+    """
+    Positional encoding built from a fixed random projection of 2-D coordinates
+    followed by sine and cosine.
+    """
+    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+        super().__init__()
+        # A missing or non-positive scale falls back to 1.0.
+        scale = 1.0 if (scale is None or scale <= 0.0) else scale
+        gaussian_matrix = scale * torch.randn((2, num_pos_feats))
+        # Stored as a buffer: it follows the module across devices without being a parameter.
+        self.register_buffer("positional_encoding_gaussian_matrix", gaussian_matrix)
+    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+        """Positionally encode points that are normalized to [0,1]."""
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        matrix = self.positional_encoding_gaussian_matrix
+        centered = 2 * coords - 1
+        if centered.dtype != matrix.dtype:
+            centered = centered.to(matrix.dtype)
+        projected = 2 * np.pi * (centered @ matrix)
+        # outputs d_1 x ... x d_n x C shape
+        return torch.cat([torch.sin(projected), torch.cos(projected)], dim=-1)
+    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+        """Generate positional encoding for a grid of the specified size."""
+        h, w = size
+        matrix = self.positional_encoding_gaussian_matrix
+        device: Any = matrix.device
+        ones = torch.ones((h, w), device=device, dtype=matrix.dtype)
+        # Running sums of ones give 1..h / 1..w; subtracting 0.5 moves to cell
+        # centers and dividing by the extent normalizes them.
+        y_embed = (ones.cumsum(dim=0) - 0.5) / h
+        x_embed = (ones.cumsum(dim=1) - 0.5) / w
+        encoded = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
+        return encoded.permute(2, 0, 1)  # C x H x W
+    def forward_with_coords(
+        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+    ) -> torch.Tensor:
+        """Positionally encode points that are not normalized to [0,1]."""
+        height, width = image_size[0], image_size[1]
+        # Work on a copy so the caller's tensor is left untouched.
+        coords = coords_input.clone()
+        coords[:, :, 0] = coords[:, :, 0] / width
+        coords[:, :, 1] = coords[:, :, 1] / height
+        return self._pe_encoding(coords.to(torch.float))  # B x N x C

model/segment_anything/modeling/sam.py ADDED Viewed

	@@ -0,0 +1,184 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Any, Dict, List, Tuple
+import torch
+from torch import nn
+from torch.nn import functional as F
+from .image_encoder import ImageEncoderViT
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+class Sam(nn.Module):
+    # Predicted mask logits above this value are reported as mask pixels.
+    mask_threshold: float = 0.0
+    image_format: str = "RGB"
+    def __init__(
+        self,
+        image_encoder: ImageEncoderViT,
+        prompt_encoder: PromptEncoder,
+        mask_decoder: MaskDecoder,
+        pixel_mean: List[float] = [123.675, 116.28, 103.53],
+        pixel_std: List[float] = [58.395, 57.12, 57.375],
+    ) -> None:
+        """
+        SAM predicts object masks from an image and input prompts.
+        Arguments:
+          image_encoder (ImageEncoderViT): The backbone used to encode the
+            image into image embeddings that allow for efficient mask prediction.
+          prompt_encoder (PromptEncoder): Encodes various types of input prompts.
+          mask_decoder (MaskDecoder): Predicts masks from the image embeddings
+            and encoded prompts.
+          pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
+          pixel_std (list(float)): Std values for normalizing pixels in the input image.
+        """
+        super().__init__()
+        self.image_encoder = image_encoder
+        self.prompt_encoder = prompt_encoder
+        self.mask_decoder = mask_decoder
+        # Non-persistent buffers (third argument False): they move with the
+        # module but are left out of the state dict. The default lists are
+        # only read here, never mutated.
+        self.register_buffer(
+            "pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False
+        )
+        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
+    @property
+    def device(self) -> Any:
+        """Device of the model, taken from the pixel_mean buffer."""
+        return self.pixel_mean.device
+    @torch.no_grad()
+    def forward(
+        self,
+        batched_input: List[Dict[str, Any]],
+        multimask_output: bool,
+    ) -> List[Dict[str, torch.Tensor]]:
+        """
+        Predicts masks end-to-end from provided images and prompts.
+        If prompts are not known in advance, using SamPredictor is
+        recommended over calling the model directly.
+        Arguments:
+          batched_input (list(dict)): A list over input images, each a
+            dictionary with the following keys. A prompt key can be
+            excluded if it is not present.
+              'image': The image as a torch tensor in 3xHxW format,
+                already transformed for input to the model.
+              'original_size': (tuple(int, int)) The original size of
+                the image before transformation, as (H, W).
+              'point_coords': (torch.Tensor) Batched point prompts for
+                this image, with shape BxNx2. Already transformed to the
+                input frame of the model.
+              'point_labels': (torch.Tensor) Batched labels for point prompts,
+                with shape BxN.
+              'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
+                Already transformed to the input frame of the model.
+              'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
+                in the form Bx1xHxW.
+              'text_embeds': (torch.Tensor) Optional embeddings handed to the
+                prompt encoder as its text_embeds argument.
+          multimask_output (bool): Whether the model should predict multiple
+            disambiguating masks, or return a single mask.
+        Returns:
+          (list(dict)): A list over input images, where each element is
+            as dictionary with the following keys.
+              'masks': (torch.Tensor) Batched binary mask predictions,
+                with shape BxCxHxW, where B is the number of input prompts,
+                C is determined by multimask_output, and (H, W) is the
+                original size of the image.
+              'iou_predictions': (torch.Tensor) The model's predictions
+                of mask quality, in shape BxC.
+              'low_res_logits': (torch.Tensor) Low resolution logits with
+                shape BxCxHxW, where H=W=256. Can be passed as mask input
+                to subsequent iterations of prediction.
+        """
+        # All images are encoded in one batch; prompts are handled per image below.
+        input_images = torch.stack(
+            [self.preprocess(x["image"]) for x in batched_input], dim=0
+        )
+        image_embeddings = self.image_encoder(input_images)
+        outputs = []
+        for image_record, curr_embedding in zip(batched_input, image_embeddings):
+            if "point_coords" in image_record:
+                points = (image_record["point_coords"], image_record["point_labels"])
+            else:
+                points = None
+            # The prompt encoder in this file takes a text_embeds argument;
+            # pass it explicitly (None when the record has no such key) so the
+            # call matches that signature.
+            sparse_embeddings, dense_embeddings = self.prompt_encoder(
+                points=points,
+                boxes=image_record.get("boxes", None),
+                masks=image_record.get("mask_inputs", None),
+                text_embeds=image_record.get("text_embeds", None),
+            )
+            low_res_masks, iou_predictions = self.mask_decoder(
+                image_embeddings=curr_embedding.unsqueeze(0),
+                image_pe=self.prompt_encoder.get_dense_pe(),
+                sparse_prompt_embeddings=sparse_embeddings,
+                dense_prompt_embeddings=dense_embeddings,
+                multimask_output=multimask_output,
+            )
+            masks = self.postprocess_masks(
+                low_res_masks,
+                input_size=image_record["image"].shape[-2:],
+                original_size=image_record["original_size"],
+            )
+            # Binarize the upscaled logits.
+            masks = masks > self.mask_threshold
+            outputs.append(
+                {
+                    "masks": masks,
+                    "iou_predictions": iou_predictions,
+                    "low_res_logits": low_res_masks,
+                }
+            )
+        return outputs
+    def postprocess_masks(
+        self,
+        masks: torch.Tensor,
+        input_size: Tuple[int, ...],
+        original_size: Tuple[int, ...],
+    ) -> torch.Tensor:
+        """
+        Remove padding and upscale masks to the original image size.
+        Arguments:
+          masks (torch.Tensor): Batched masks from the mask_decoder,
+            in BxCxHxW format.
+          input_size (tuple(int, int)): The size of the image input to the
+            model, in (H, W) format. Used to remove padding.
+          original_size (tuple(int, int)): The original size of the image
+            before resizing for input to the model, in (H, W) format.
+        Returns:
+          (torch.Tensor): Batched masks in BxCxHxW format, where (H, W)
+            is given by original_size. The result is float32 because the
+            masks are cast with .float() before interpolation.
+        """
+        encoder_size = self.image_encoder.img_size
+        # First upscale to the padded square the encoder saw.
+        masks = F.interpolate(
+            masks.float(),
+            (encoder_size, encoder_size),
+            mode="bilinear",
+            align_corners=False,
+        )
+        # Drop the bottom/right padding, then resize to the original image.
+        masks = masks[..., : input_size[0], : input_size[1]]
+        masks = F.interpolate(
+            masks, original_size, mode="bilinear", align_corners=False
+        )
+        return masks
+    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize pixel values and pad to a square input."""
+        # Normalize colors
+        x = (x - self.pixel_mean) / self.pixel_std
+        # Pad on the right and bottom up to the encoder's square input size.
+        # NOTE(review): assumes h and w do not exceed img_size; otherwise the
+        # pad amounts go negative - confirm callers resize first.
+        h, w = x.shape[-2:]
+        padh = self.image_encoder.img_size - h
+        padw = self.image_encoder.img_size - w
+        x = F.pad(x, (0, padw, 0, padh))
+        return x

model/segment_anything/modeling/transformer.py ADDED Viewed

	@@ -0,0 +1,242 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Tuple, Type
+import torch
+from torch import Tensor, nn
+from .common import MLPBlock
+class TwoWayTransformer(nn.Module):
+    def __init__(
+        self,
+        depth: int,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+    ) -> None:
+        """
+        A transformer decoder that attends to an input image using
+        queries whose positional embedding is supplied.
+        Args:
+          depth (int): number of layers in the transformer
+          embedding_dim (int): the channel dimension for the input embeddings
+          num_heads (int): the number of heads for multihead attention. Must
+            divide embedding_dim
+          mlp_dim (int): the channel dimension internal to the MLP block
+          activation (nn.Module): the activation to use in the MLP block
+          attention_downsample_rate (int): downsample rate handed to the
+            attention layers
+        """
+        super().__init__()
+        self.depth = depth
+        self.embedding_dim = embedding_dim
+        self.num_heads = num_heads
+        self.mlp_dim = mlp_dim
+        # Only the first block is flagged to skip the positional encoding.
+        self.layers = nn.ModuleList(
+            [
+                TwoWayAttentionBlock(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    mlp_dim=mlp_dim,
+                    activation=activation,
+                    attention_downsample_rate=attention_downsample_rate,
+                    skip_first_layer_pe=(layer_index == 0),
+                )
+                for layer_index in range(depth)
+            ]
+        )
+        self.final_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm_final_attn = nn.LayerNorm(embedding_dim)
+    def forward(
+        self,
+        image_embedding: Tensor,
+        image_pe: Tensor,
+        point_embedding: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+          image_embedding (torch.Tensor): image to attend to. Should be shape
+            B x embedding_dim x h x w for any h and w.
+          image_pe (torch.Tensor): the positional encoding to add to the image. Must
+            have the same shape as image_embedding.
+          point_embedding (torch.Tensor): the embedding to add to the query points.
+            Must have shape B x N_points x embedding_dim for any N_points.
+        Returns:
+          torch.Tensor: the processed point_embedding
+          torch.Tensor: the processed image_embedding
+        """
+        # The values are unused, but the unpacking rejects inputs that are not 4-D.
+        _batch, _channels, _height, _width = image_embedding.shape
+        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+        keys = image_embedding.flatten(2).permute(0, 2, 1)
+        key_pe = image_pe.flatten(2).permute(0, 2, 1)
+        # The prompt tokens act as queries and double as their own positional encoding.
+        queries = point_embedding
+        for block in self.layers:
+            queries, keys = block(
+                queries=queries,
+                keys=keys,
+                query_pe=point_embedding,
+                key_pe=key_pe,
+            )
+        # Final attention from the points to the image, then a residual add and layernorm.
+        attn_out = self.final_attn_token_to_image(
+            q=queries + point_embedding, k=keys + key_pe, v=keys
+        )
+        queries = self.norm_final_attn(queries + attn_out)
+        return queries, keys
+class TwoWayAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int = 2048,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+        skip_first_layer_pe: bool = False,
+    ) -> None:
+        """
+        One block of the two-way transformer. It runs, in order:
+        (1) self-attention over the sparse tokens, (2) cross-attention from
+        the sparse tokens to the dense (image) tokens, (3) an MLP on the
+        sparse tokens, and (4) cross-attention from the dense tokens back to
+        the sparse tokens.
+        Arguments:
+          embedding_dim (int): the channel dimension of the embeddings
+          num_heads (int): the number of heads in the attention layers
+          mlp_dim (int): the hidden dimension of the mlp block
+          activation (nn.Module): the activation of the mlp block
+          attention_downsample_rate (int): passed as downsample_rate to the
+            two cross-attention layers (the self-attention uses the default)
+          skip_first_layer_pe (bool): skip the PE on the first layer
+        """
+        super().__init__()
+        # Submodules are created in the same order as before so parameter
+        # initialisation and state_dict ordering are unchanged.
+        self.self_attn = Attention(embedding_dim, num_heads)
+        self.norm1 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm2 = nn.LayerNorm(embedding_dim)
+        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
+        self.norm3 = nn.LayerNorm(embedding_dim)
+        self.norm4 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_image_to_token = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.skip_first_layer_pe = skip_first_layer_pe
+    def forward(
+        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Apply the four sub-layers and return the updated (queries, keys).
+        query_pe / key_pe are added to the attention inputs q and k only;
+        the values v are always the un-encoded tokens.
+        """
+        # (1) Self-attention on the tokens. When the PE is skipped the
+        # attention output replaces the tokens (no residual add).
+        if self.skip_first_layer_pe:
+            queries = self.self_attn(q=queries, k=queries, v=queries)
+        else:
+            tokens_with_pe = queries + query_pe
+            self_out = self.self_attn(q=tokens_with_pe, k=tokens_with_pe, v=queries)
+            queries = queries + self_out
+        queries = self.norm1(queries)
+        # (2) Tokens attend to the image embedding
+        tokens_with_pe = queries + query_pe
+        image_with_pe = keys + key_pe
+        cross_out = self.cross_attn_token_to_image(
+            q=tokens_with_pe, k=image_with_pe, v=keys
+        )
+        queries = self.norm2(queries + cross_out)
+        # (3) MLP on the tokens
+        queries = self.norm3(queries + self.mlp(queries))
+        # (4) Image embedding attends to the tokens (roles of q and k swapped)
+        tokens_with_pe = queries + query_pe
+        image_with_pe = keys + key_pe
+        back_out = self.cross_attn_image_to_token(
+            q=image_with_pe, k=tokens_with_pe, v=queries
+        )
+        keys = self.norm4(keys + back_out)
+        return queries, keys
+class Attention(nn.Module):
+    """
+    Multi-head attention whose q/k/v projections may map the embedding to a
+    smaller internal width (embedding_dim // downsample_rate); the output
+    projection maps back to embedding_dim.
+    """
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        downsample_rate: int = 1,
+    ) -> None:
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.internal_dim = embedding_dim // downsample_rate
+        self.num_heads = num_heads
+        # The check is on the (possibly downsampled) internal width.
+        assert (
+            self.internal_dim % num_heads == 0
+        ), "num_heads must divide embedding_dim."
+        # Projection layers are created in q, k, v, out order.
+        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+        """Split B x N_tokens x C into B x N_heads x N_tokens x C_per_head."""
+        batch, tokens, channels = x.shape
+        per_head = channels // num_heads
+        return x.reshape(batch, tokens, num_heads, per_head).transpose(1, 2)
+    def _recombine_heads(self, x: Tensor) -> Tensor:
+        """Inverse of _separate_heads: back to B x N_tokens x C."""
+        batch, heads, tokens, per_head = x.shape
+        merged = x.transpose(1, 2)
+        return merged.reshape(batch, tokens, heads * per_head)
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        """Scaled dot-product attention of q over k, v; returns B x N_q x embedding_dim."""
+        # Project the inputs and split them into heads
+        q_heads = self._separate_heads(self.q_proj(q), self.num_heads)
+        k_heads = self._separate_heads(self.k_proj(k), self.num_heads)
+        v_heads = self._separate_heads(self.v_proj(v), self.num_heads)
+        # Attention weights: B x N_heads x N_q x N_k, scaled by sqrt(C_per_head)
+        head_dim = q_heads.shape[-1]
+        scores = (q_heads @ k_heads.transpose(-2, -1)) / math.sqrt(head_dim)
+        weights = torch.softmax(scores, dim=-1)
+        # Weighted sum of values, heads merged, then projected back out
+        return self.out_proj(self._recombine_heads(weights @ v_heads))

model/segment_anything/predictor.py ADDED Viewed

	@@ -0,0 +1,284 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Optional, Tuple
+import numpy as np
+import torch
+from .modeling import Sam
+from .utils.transforms import ResizeLongestSide
+class SamPredictor:
+    def __init__(
+        self,
+        sam_model: Sam,
+    ) -> None:
+        """
+        Uses SAM to calculate the image embedding for an image, and then
+        allow repeated, efficient mask prediction given prompts.
+        Arguments:
+          sam_model (Sam): The model to use for mask prediction.
+        """
+        super().__init__()
+        self.model = sam_model
+        self.transform = ResizeLongestSide(sam_model.image_encoder.img_size)
+        # Start with no image set; this also creates every per-image attribute.
+        self.reset_image()
+    def set_image(
+        self,
+        image: np.ndarray,
+        image_format: str = "RGB",
+    ) -> None:
+        """
+        Calculates the image embeddings for the provided image, allowing
+        masks to be predicted with the 'predict' method.
+        Arguments:
+          image (np.ndarray): The image for calculating masks. Expects an
+            image in HWC uint8 format, with pixel values in [0, 255].
+          image_format (str): The color format of the image, in ['RGB', 'BGR'].
+        """
+        assert image_format in [
+            "RGB",
+            "BGR",
+        ], f"image_format must be in ['RGB', 'BGR'], is {image_format}."
+        # Reverse the channel axis when the caller's format differs from the model's.
+        if image_format != self.model.image_format:
+            image = image[..., ::-1]
+        # Transform the image to the form expected by the model
+        input_image = self.transform.apply_image(image)
+        input_image_torch = torch.as_tensor(input_image, device=self.device)
+        # HWC -> 1xCxHxW
+        input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[
+            None, :, :, :
+        ]
+        self.set_torch_image(input_image_torch, image.shape[:2])
+    @torch.no_grad()
+    def set_torch_image(
+        self,
+        transformed_image: torch.Tensor,
+        original_image_size: Tuple[int, ...],
+    ) -> None:
+        """
+        Calculates the image embeddings for the provided image, allowing
+        masks to be predicted with the 'predict' method. Expects the input
+        image to be already transformed to the format expected by the model.
+        Arguments:
+          transformed_image (torch.Tensor): The input image, with shape
+            1x3xHxW, which has been transformed with ResizeLongestSide.
+          original_image_size (tuple(int, int)): The size of the image
+            before transformation, in (H, W) format.
+        """
+        assert (
+            len(transformed_image.shape) == 4
+            and transformed_image.shape[1] == 3
+            and max(*transformed_image.shape[2:]) == self.model.image_encoder.img_size
+        ), f"set_torch_image input must be BCHW with long side {self.model.image_encoder.img_size}."
+        self.reset_image()
+        self.original_size = original_image_size
+        self.input_size = tuple(transformed_image.shape[-2:])
+        input_image = self.model.preprocess(transformed_image)
+        self.features = self.model.image_encoder(input_image)
+        self.is_image_set = True
+    def predict(
+        self,
+        point_coords: Optional[np.ndarray] = None,
+        point_labels: Optional[np.ndarray] = None,
+        box: Optional[np.ndarray] = None,
+        mask_input: Optional[np.ndarray] = None,
+        multimask_output: bool = True,
+        return_logits: bool = False,
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Predict masks for the given input prompts, using the currently set image.
+        Arguments:
+          point_coords (np.ndarray or None): A Nx2 array of point prompts to the
+            model. Each point is in (X,Y) in pixels.
+          point_labels (np.ndarray or None): A length N array of labels for the
+            point prompts. 1 indicates a foreground point and 0 indicates a
+            background point.
+          box (np.ndarray or None): A length 4 array given a box prompt to the
+            model, in XYXY format.
+          mask_input (np.ndarray): A low resolution mask input to the model, typically
+            coming from a previous prediction iteration. Has form 1xHxW, where
+            for SAM, H=W=256.
+          multimask_output (bool): If true, the model will return three masks.
+            For ambiguous input prompts (such as a single click), this will often
+            produce better masks than a single prediction. If only a single
+            mask is needed, the model's predicted quality score can be used
+            to select the best mask. For non-ambiguous prompts, such as multiple
+            input prompts, multimask_output=False can give better results.
+          return_logits (bool): If true, returns un-thresholded masks logits
+            instead of a binary mask.
+        Returns:
+          (np.ndarray): The output masks in CxHxW format, where C is the
+            number of masks, and (H, W) is the original image size.
+          (np.ndarray): An array of length C containing the model's
+            predictions for the quality of each mask.
+          (np.ndarray): An array of shape CxHxW, where C is the number
+            of masks and H=W=256. These low resolution logits can be passed to
+            a subsequent iteration as mask input.
+        Raises:
+          RuntimeError: If no image has been set.
+        """
+        if not self.is_image_set:
+            raise RuntimeError(
+                "An image must be set with .set_image(...) before mask prediction."
+            )
+        # Transform input prompts
+        coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None
+        if point_coords is not None:
+            assert (
+                point_labels is not None
+            ), "point_labels must be supplied if point_coords is supplied."
+            point_coords = self.transform.apply_coords(point_coords, self.original_size)
+            coords_torch = torch.as_tensor(
+                point_coords, dtype=torch.float, device=self.device
+            )
+            labels_torch = torch.as_tensor(
+                point_labels, dtype=torch.int, device=self.device
+            )
+            # Add the batch dimension expected by predict_torch
+            coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :]
+        if box is not None:
+            box = self.transform.apply_boxes(box, self.original_size)
+            box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device)
+            box_torch = box_torch[None, :]
+        if mask_input is not None:
+            mask_input_torch = torch.as_tensor(
+                mask_input, dtype=torch.float, device=self.device
+            )
+            mask_input_torch = mask_input_torch[None, :, :, :]
+        masks, iou_predictions, low_res_masks = self.predict_torch(
+            coords_torch,
+            labels_torch,
+            box_torch,
+            mask_input_torch,
+            multimask_output,
+            return_logits=return_logits,
+        )
+        # Drop the batch dimension again and hand back numpy arrays
+        masks_np = masks[0].detach().cpu().numpy()
+        iou_predictions_np = iou_predictions[0].detach().cpu().numpy()
+        low_res_masks_np = low_res_masks[0].detach().cpu().numpy()
+        return masks_np, iou_predictions_np, low_res_masks_np
+    @torch.no_grad()
+    def predict_torch(
+        self,
+        point_coords: Optional[torch.Tensor],
+        point_labels: Optional[torch.Tensor],
+        boxes: Optional[torch.Tensor] = None,
+        mask_input: Optional[torch.Tensor] = None,
+        multimask_output: bool = True,
+        return_logits: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Predict masks for the given input prompts, using the currently set image.
+        Input prompts are batched torch tensors and are expected to already be
+        transformed to the input frame using ResizeLongestSide.
+        Arguments:
+          point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+            model. Each point is in (X,Y) in pixels.
+          point_labels (torch.Tensor or None): A BxN array of labels for the
+            point prompts. 1 indicates a foreground point and 0 indicates a
+            background point. Only used when point_coords is given.
+          boxes (torch.Tensor or None): A Bx4 array given a box prompt to the
+            model, in XYXY format.
+          mask_input (torch.Tensor or None): A low resolution mask input to the
+            model, typically coming from a previous prediction iteration. Has
+            form Bx1xHxW, where for SAM, H=W=256. Masks returned by a previous
+            iteration of the predict method do not need further transformation.
+          multimask_output (bool): If true, the model will return three masks.
+            For ambiguous input prompts (such as a single click), this will often
+            produce better masks than a single prediction. If only a single
+            mask is needed, the model's predicted quality score can be used
+            to select the best mask. For non-ambiguous prompts, such as multiple
+            input prompts, multimask_output=False can give better results.
+          return_logits (bool): If true, returns un-thresholded masks logits
+            instead of a binary mask.
+        Returns:
+          (torch.Tensor): The output masks in BxCxHxW format, where C is the
+            number of masks, and (H, W) is the original image size.
+          (torch.Tensor): An array of shape BxC containing the model's
+            predictions for the quality of each mask.
+          (torch.Tensor): An array of shape BxCxHxW, where C is the number
+            of masks and H=W=256. These low res logits can be passed to
+            a subsequent iteration as mask input.
+        Raises:
+          RuntimeError: If no image has been set.
+        """
+        if not self.is_image_set:
+            raise RuntimeError(
+                "An image must be set with .set_image(...) before mask prediction."
+            )
+        if point_coords is not None:
+            points = (point_coords, point_labels)
+        else:
+            points = None
+        # Embed prompts
+        sparse_embeddings, dense_embeddings = self.model.prompt_encoder(
+            points=points,
+            boxes=boxes,
+            masks=mask_input,
+        )
+        # Predict masks
+        low_res_masks, iou_predictions = self.model.mask_decoder(
+            image_embeddings=self.features,
+            image_pe=self.model.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+        )
+        # Upscale the masks to the original image resolution
+        masks = self.model.postprocess_masks(
+            low_res_masks, self.input_size, self.original_size
+        )
+        if not return_logits:
+            masks = masks > self.model.mask_threshold
+        return masks, iou_predictions, low_res_masks
+    def get_image_embedding(self) -> torch.Tensor:
+        """
+        Returns the image embeddings for the currently set image, with
+        shape 1xCxHxW, where C is the embedding dimension and (H,W) are
+        the embedding spatial dimension of SAM (typically C=256, H=W=64).
+        Raises:
+          RuntimeError: If no image has been set.
+        """
+        if not self.is_image_set:
+            raise RuntimeError(
+                "An image must be set with .set_image(...) to generate an embedding."
+            )
+        assert (
+            self.features is not None
+        ), "Features must exist if an image has been set."
+        return self.features
+    @property
+    def device(self) -> torch.device:
+        """The device of the wrapped model (self.model.device)."""
+        return self.model.device
+    def reset_image(self) -> None:
+        """Resets the currently set image and all state derived from it."""
+        self.is_image_set = False
+        self.features = None
+        # These two are what set_torch_image() writes and what predict() /
+        # predict_torch() read, so they are the ones that must be cleared.
+        self.original_size = None
+        self.input_size = None
+        # Legacy attribute names: nothing in this class reads them, but they
+        # are still reset so external code that inspects them keeps working.
+        self.orig_h = None
+        self.orig_w = None
+        self.input_h = None
+        self.input_w = None

model/segment_anything/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

model/segment_anything/utils/amg.py ADDED Viewed

	@@ -0,0 +1,346 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from copy import deepcopy
+from itertools import product
+from typing import Any, Dict, Generator, ItemsView, List, Tuple
+import numpy as np
+import torch
+class MaskData:
+    """
+    Dictionary-like container holding per-mask fields (lists, numpy arrays
+    or torch tensors) side by side, with helpers to filter every field with
+    one index/mask and to append another MaskData.
+    """
+    def __init__(self, **kwargs) -> None:
+        for value in kwargs.values():
+            assert isinstance(
+                value, (list, np.ndarray, torch.Tensor)
+            ), "MaskData only supports list, numpy arrays, and torch tensors."
+        self._stats = dict(**kwargs)
+    def __setitem__(self, key: str, item: Any) -> None:
+        supported = (list, np.ndarray, torch.Tensor)
+        assert isinstance(
+            item, supported
+        ), "MaskData only supports list, numpy arrays, and torch tensors."
+        self._stats[key] = item
+    def __delitem__(self, key: str) -> None:
+        del self._stats[key]
+    def __getitem__(self, key: str) -> Any:
+        return self._stats[key]
+    def items(self) -> ItemsView[str, Any]:
+        return self._stats.items()
+    def filter(self, keep: torch.Tensor) -> None:
+        """Index every stored field with keep (a boolean mask or indices), in place."""
+        for key, value in self._stats.items():
+            if value is None:
+                selected = None
+            elif isinstance(value, torch.Tensor):
+                selected = value[torch.as_tensor(keep, device=value.device)]
+            elif isinstance(value, np.ndarray):
+                selected = value[keep.detach().cpu().numpy()]
+            elif isinstance(value, list):
+                if keep.dtype == torch.bool:
+                    # Boolean mask: keep the positions whose flag is set
+                    selected = [
+                        item for pos, item in enumerate(value) if keep[pos]
+                    ]
+                else:
+                    # Index tensor: gather in the order given
+                    selected = [value[idx] for idx in keep]
+            else:
+                raise TypeError(
+                    f"MaskData key {key} has an unsupported type {type(value)}."
+                )
+            # Only existing keys are reassigned, so iterating is safe.
+            self._stats[key] = selected
+    def cat(self, new_stats: "MaskData") -> None:
+        """Append the fields of new_stats to this object's fields, in place."""
+        for key, incoming in new_stats.items():
+            current = self._stats.get(key)
+            if current is None:
+                # Missing (or None) here: take a private copy of the new field
+                self._stats[key] = deepcopy(incoming)
+            elif isinstance(incoming, torch.Tensor):
+                self._stats[key] = torch.cat([current, incoming], dim=0)
+            elif isinstance(incoming, np.ndarray):
+                self._stats[key] = np.concatenate([current, incoming], axis=0)
+            elif isinstance(incoming, list):
+                self._stats[key] = current + deepcopy(incoming)
+            else:
+                raise TypeError(
+                    f"MaskData key {key} has an unsupported type {type(incoming)}."
+                )
+    def to_numpy(self) -> None:
+        """Convert every torch tensor field to a numpy array, in place."""
+        for key, value in self._stats.items():
+            if isinstance(value, torch.Tensor):
+                self._stats[key] = value.detach().cpu().numpy()
+def is_box_near_crop_edge(
+    boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
+) -> torch.Tensor:
+    """
+    Flag boxes that have a coordinate within atol of the crop box but not
+    within atol of the original image box. Returns one boolean per box.
+    """
+    device = boxes.device
+    crop_edges = torch.as_tensor(crop_box, dtype=torch.float, device=device)
+    image_edges = torch.as_tensor(orig_box, dtype=torch.float, device=device)
+    # Move the boxes from crop coordinates into original-image coordinates
+    absolute_boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
+    touches_crop = torch.isclose(
+        absolute_boxes, crop_edges[None, :], atol=atol, rtol=0
+    )
+    touches_image = torch.isclose(
+        absolute_boxes, image_edges[None, :], atol=atol, rtol=0
+    )
+    # A coordinate only counts when it is near the crop edge alone
+    crop_only = torch.logical_and(touches_crop, ~touches_image)
+    return torch.any(crop_only, dim=1)
+def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor:
+    """Convert a single box from (x0, y0, x1, y1) to (x0, y0, width, height); the input is not modified."""
+    converted = deepcopy(box_xyxy)
+    left, top = converted[0], converted[1]
+    converted[2] = converted[2] - left
+    converted[3] = converted[3] - top
+    return converted
+def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
+    """
+    Yield consecutive slices of length batch_size (the last one may be
+    shorter) from every sequence in args; each yielded item is a list with
+    one slice per input sequence. All inputs must have the same length.
+    """
+    assert len(args) > 0 and all(
+        len(a) == len(args[0]) for a in args
+    ), "Batched iteration must have inputs of all the same size."
+    total = len(args[0])
+    # Ceiling division: one extra batch when there is a remainder
+    full_batches, remainder = divmod(total, batch_size)
+    n_batches = full_batches + (1 if remainder != 0 else 0)
+    for index in range(n_batches):
+        start = index * batch_size
+        stop = (index + 1) * batch_size
+        yield [seq[start:stop] for seq in args]
+def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]:
+    """
+    Encode a B x H x W batch of masks as uncompressed run-length encodings.
+    Each result is {"size": [h, w], "counts": [...]}, with pixels read in
+    column-major order and counts starting with a run of zeros.
+    """
+    b, h, w = tensor.shape
+    # Column-major (Fortran) order: swap h and w, then flatten per mask
+    flat = tensor.permute(0, 2, 1).flatten(1)
+    # (mask index, position) of every place where neighbouring pixels differ
+    transitions = (flat[:, 1:] ^ flat[:, :-1]).nonzero()
+    encoded = []
+    for mask_idx in range(b):
+        in_this_mask = transitions[:, 0] == mask_idx
+        boundaries = transitions[in_this_mask, 1]
+        # Run edges: start of data, every change point, end of data
+        edges = torch.cat(
+            [
+                boundaries.new_tensor([0]),
+                boundaries + 1,
+                boundaries.new_tensor([h * w]),
+            ]
+        )
+        run_lengths = edges[1:] - edges[:-1]
+        # Counts must begin with a zero-run, so add an empty one if the
+        # mask starts with a set pixel.
+        counts = [] if flat[mask_idx, 0] == 0 else [0]
+        counts.extend(run_lengths.detach().cpu().tolist())
+        encoded.append({"size": [h, w], "counts": counts})
+    return encoded
+def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray:
+    """Decode an uncompressed RLE ({"size": [h, w], "counts": [...]}) into an h x w boolean mask."""
+    h, w = rle["size"]
+    flat = np.empty(h * w, dtype=bool)
+    cursor = 0
+    # Runs alternate, starting with a run of False
+    for run_index, run_length in enumerate(rle["counts"]):
+        flat[cursor : cursor + run_length] = run_index % 2 == 1
+        cursor += run_length
+    # Runs are stored column-major, so reshape as (w, h) and transpose
+    return flat.reshape(w, h).transpose()
+def area_from_rle(rle: Dict[str, Any]) -> int:
+    """Sum the odd-indexed runs of rle["counts"], i.e. the runs of set pixels."""
+    return sum(
+        run for position, run in enumerate(rle["counts"]) if position % 2 == 1
+    )
+def calculate_stability_score(
+    masks: torch.Tensor, mask_threshold: float, threshold_offset: float
+) -> torch.Tensor:
+    """
+    Stability score of each mask in a batch: the pixel count of the mask
+    thresholded at (mask_threshold + threshold_offset) divided by the pixel
+    count at (mask_threshold - threshold_offset). Since the stricter mask
+    lies inside the looser one, this ratio is their IoU.
+    """
+    def _pixel_count(cutoff: float) -> torch.Tensor:
+        # Sum the last axis as int16 and the next as int32 to avoid an
+        # unnecessary cast to torch.int64 (saves memory).
+        binary = masks > cutoff
+        return binary.sum(-1, dtype=torch.int16).sum(-1, dtype=torch.int32)
+    intersections = _pixel_count(mask_threshold + threshold_offset)
+    unions = _pixel_count(mask_threshold - threshold_offset)
+    return intersections / unions
+def build_point_grid(n_per_side: int) -> np.ndarray:
+    """
+    Return an (n_per_side**2) x 2 array of (x, y) points on a regular grid
+    in [0,1]x[0,1], inset by half a cell from the border. Points are ordered
+    row by row (x varies fastest).
+    """
+    half_cell = 1 / (2 * n_per_side)
+    axis = np.linspace(half_cell, 1 - half_cell, n_per_side)
+    # Default 'xy' indexing: grid_x varies along columns, grid_y along rows
+    grid_x, grid_y = np.meshgrid(axis, axis)
+    return np.stack([grid_x, grid_y], axis=-1).reshape(-1, 2)
+def build_all_layer_point_grids(
+    n_per_side: int, n_layers: int, scale_per_layer: int
+) -> List[np.ndarray]:
+    """
+    Build one point grid per crop layer (n_layers + 1 grids in total).
+    Layer i uses int(n_per_side / scale_per_layer**i) points per side.
+    """
+    return [
+        build_point_grid(int(n_per_side / (scale_per_layer**layer)))
+        for layer in range(n_layers + 1)
+    ]
+def generate_crop_boxes(
+    im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
+) -> Tuple[List[List[int]], List[int]]:
+    """
+    Build the crop boxes for every layer. Layer 0 is the full image; layer i
+    (i >= 1) has (2**i)**2 overlapping crops. Returns the boxes as
+    [x0, y0, x1, y1] lists together with the layer index of each box.
+    """
+    im_h, im_w = im_size
+    short_side = min(im_h, im_w)
+    def _crop_extent(full_extent, n_crops, overlap):
+        # Length each crop needs so n_crops crops with the given overlap span full_extent
+        return int(math.ceil((overlap * (n_crops - 1) + full_extent) / n_crops))
+    # Layer 0: the original image
+    crop_boxes = [[0, 0, im_w, im_h]]
+    layer_idxs = [0]
+    for layer in range(1, n_layers + 1):
+        per_side = 2**layer
+        overlap = int(overlap_ratio * short_side * (2 / per_side))
+        crop_w = _crop_extent(im_w, per_side, overlap)
+        crop_h = _crop_extent(im_h, per_side, overlap)
+        x_starts = [int((crop_w - overlap) * i) for i in range(per_side)]
+        y_starts = [int((crop_h - overlap) * i) for i in range(per_side)]
+        # Boxes are stored as corners (x0, y0, x1, y1), clipped to the image
+        for x0, y0 in product(x_starts, y_starts):
+            x1 = min(x0 + crop_w, im_w)
+            y1 = min(y0 + crop_h, im_h)
+            crop_boxes.append([x0, y0, x1, y1])
+            layer_idxs.append(layer)
+    return crop_boxes, layer_idxs
+def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+    """Shift XYXY boxes by the crop's top-left corner (x0, y0) of crop_box."""
+    left, top = crop_box[0], crop_box[1]
+    shift = torch.tensor([[left, top, left, top]], device=boxes.device)
+    # 3-D input carries an extra (channel) dimension; broadcast across it
+    if boxes.dim() == 3:
+        shift = shift[:, None, :]
+    return boxes + shift
+def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+    """Shift (x, y) points by the crop's top-left corner (x0, y0) of crop_box."""
+    left, top = crop_box[0], crop_box[1]
+    shift = torch.tensor([[left, top]], device=points.device)
+    # 3-D input carries an extra (channel) dimension; broadcast across it
+    if points.dim() == 3:
+        shift = shift[:, None, :]
+    return points + shift
+def uncrop_masks(
+    masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int
+) -> torch.Tensor:
+    """
+    Zero-pad masks predicted on a crop so they line up with the original
+    orig_h x orig_w image. Masks for a crop covering the whole image are
+    returned unchanged.
+    """
+    x0, y0, x1, y1 = crop_box
+    if (x0, y0, x1, y1) == (0, 0, orig_w, orig_h):
+        return masks
+    # F.pad takes (left, right, top, bottom) for the last two dimensions
+    pad_left, pad_right = x0, orig_w - x1
+    pad_top, pad_bottom = y0, orig_h - y1
+    return torch.nn.functional.pad(
+        masks, (pad_left, pad_right, pad_top, pad_bottom), value=0
+    )
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Remove small connected regions from a mask. With mode "holes", holes
+    smaller than area_thresh are filled; with mode "islands", foreground
+    regions smaller than area_thresh are dropped. Returns the resulting mask
+    and a flag telling whether anything was changed.
+    """
+    import cv2  # type: ignore
+    assert mode in ["holes", "islands"]
+    filling_holes = mode == "holes"
+    # XOR with True inverts the mask, so holes become the labelled regions
+    component_input = (filling_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(
+        component_input, 8
+    )
+    areas = stats[:, -1][1:]  # Row 0 is background label
+    too_small = [
+        label for label, area in enumerate(areas, start=1) if area < area_thresh
+    ]
+    if not too_small:
+        return mask, False
+    selected_labels = [0] + too_small
+    if not filling_holes:
+        # Islands mode: keep every label that is neither background nor small
+        selected_labels = [
+            label for label in range(n_labels) if label not in selected_labels
+        ]
+        # If every region is below threshold, keep largest
+        if not selected_labels:
+            selected_labels = [int(np.argmax(areas)) + 1]
+    return np.isin(regions, selected_labels), True
+def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]:
+    """Convert an uncompressed RLE to the pycocotools RLE, with "counts" decoded to a str."""
+    from pycocotools import mask as mask_utils  # type: ignore
+    height, width = uncompressed_rle["size"]
+    compressed = mask_utils.frPyObjects(uncompressed_rle, height, width)
+    # Decode the counts to text: necessary to serialize with json
+    compressed["counts"] = compressed["counts"].decode("utf-8")
+    return compressed
+def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the XYXY bounding box of every mask. Input of shape
+    C1xC2x...xHxW gives output of shape C1xC2x...x4; an empty mask yields
+    [0, 0, 0, 0].
+    """
+    # torch.max below raises an error on empty inputs, just skip in this case
+    if torch.numel(masks) == 0:
+        return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
+    original_shape = masks.shape
+    height, width = original_shape[-2:]
+    has_leading_dims = len(original_shape) > 2
+    # Normalize shape to CxHxW
+    flat = masks.flatten(0, -3) if has_leading_dims else masks.unsqueeze(0)
+    def _extent(occupied, length):
+        # occupied: C x length flags. Returns (first, last) occupied index per row.
+        positions = occupied * torch.arange(length, device=occupied.device)[None, :]
+        last, _ = torch.max(positions, dim=-1)
+        # Push unoccupied slots past the end so they cannot win the min
+        positions = positions + length * (~occupied)
+        first, _ = torch.min(positions, dim=-1)
+        return first, last
+    # Rows that contain any set pixel give the top and bottom edges
+    rows_hit, _ = torch.max(flat, dim=-1)
+    top_edges, bottom_edges = _extent(rows_hit, height)
+    # Columns that contain any set pixel give the left and right edges
+    cols_hit, _ = torch.max(flat, dim=-2)
+    left_edges, right_edges = _extent(cols_hit, width)
+    # For an empty mask the right edge ends up left of the left edge;
+    # such boxes are replaced with [0, 0, 0, 0].
+    is_empty = (right_edges < left_edges) | (bottom_edges < top_edges)
+    boxes = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
+    boxes = boxes * (~is_empty).unsqueeze(-1)
+    # Return to original shape
+    if has_leading_dims:
+        return boxes.reshape(*original_shape[:-2], 4)
+    return boxes[0]

model/segment_anything/utils/onnx.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Tuple
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from ..modeling import Sam
+from .amg import calculate_stability_score
+class SamOnnxModel(nn.Module):
+    """
+    This model should not be called directly, but is used in ONNX export.
+    It combines the prompt encoder, mask decoder, and mask postprocessing of Sam,
+    with some functions modified to enable model tracing. Also supports extra
+    options controlling what information. See the ONNX export script for details.
+    """
+    def __init__(
+        self,
+        model: Sam,
+        return_single_mask: bool,
+        use_stability_score: bool = False,
+        return_extra_metrics: bool = False,
+    ) -> None:
+        """
+        Store the wrapped Sam model and the export options.
+        Arguments:
+          model (Sam): the model whose prompt encoder and mask decoder are used.
+          return_single_mask (bool): stored as-is; how it is used is not
+            visible in this part of the file.
+          use_stability_score (bool): stored as-is (see note above).
+          return_extra_metrics (bool): stored as-is (see note above).
+        """
+        super().__init__()
+        # The decoder is registered under its own name in addition to being
+        # reachable through self.model.
+        self.mask_decoder = model.mask_decoder
+        self.model = model
+        # Side length of the image encoder's input, taken from the model
+        self.img_size = model.image_encoder.img_size
+        self.return_single_mask = return_single_mask
+        self.use_stability_score = use_stability_score
+        # NOTE(review): presumably the threshold offset handed to
+        # calculate_stability_score -- confirm against the forward pass.
+        self.stability_score_offset = 1.0
+        self.return_extra_metrics = return_extra_metrics
+    @staticmethod
+    def resize_longest_image_size(
+        input_image_size: torch.Tensor, longest_side: int
+    ) -> torch.Tensor:
+        """
+        Scale an image size so that its larger entry equals longest_side,
+        rounding each entry half up. Returns an int64 tensor.
+        """
+        size_as_float = input_image_size.to(torch.float32)
+        ratio = longest_side / torch.max(size_as_float)
+        scaled = ratio * size_as_float
+        # floor(x + 0.5) rounds to nearest with halves going up
+        return torch.floor(scaled + 0.5).to(torch.int64)
+    def _embed_points(
+        self, point_coords: torch.Tensor, point_labels: torch.Tensor
+    ) -> torch.Tensor:
+        point_coords = point_coords + 0.5
+        point_coords = point_coords / self.img_size
+        point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords)
+        point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding)
+        point_embedding = point_embedding * (point_labels != -1)
+        point_embedding = (
+            point_embedding
+            + self.model.prompt_encoder.not_a_point_embed.weight * (point_labels == -1)
+        )
+        for i in range(self.model.prompt_encoder.num_point_embeddings):
+            point_embedding = (
+                point_embedding
+                + self.model.prompt_encoder.point_embeddings[i].weight
+                * (point_labels == i)
+            )
+        return point_embedding
+    def _embed_masks(
+        self, input_mask: torch.Tensor, has_mask_input: torch.Tensor
+    ) -> torch.Tensor:
+        mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(
+            input_mask
+        )
+        mask_embedding = mask_embedding + (
+            1 - has_mask_input
+        ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1)
+        return mask_embedding
+    def mask_postprocessing(
+        self, masks: torch.Tensor, orig_im_size: torch.Tensor
+    ) -> torch.Tensor:
+        masks = F.interpolate(
+            masks,
+            size=(self.img_size, self.img_size),
+            mode="bilinear",
+            align_corners=False,
+        )
+        prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size).to(
+            torch.int64
+        )
+        masks = masks[..., : prepadded_size[0], : prepadded_size[1]]  # type: ignore
+        orig_im_size = orig_im_size.to(torch.int64)
+        h, w = orig_im_size[0], orig_im_size[1]
+        masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False)
+        return masks
+    def select_masks(
+        self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Determine if we should return the multiclick mask or not from the number of points.
+        # The reweighting is used to avoid control flow.
+        score_reweight = torch.tensor(
+            [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)]
+        ).to(iou_preds.device)
+        score = iou_preds + (num_points - 2.5) * score_reweight
+        best_idx = torch.argmax(score, dim=1)
+        masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1)
+        iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1)
+        return masks, iou_preds
+    @torch.no_grad()
+    def forward(
+        self,
+        image_embeddings: torch.Tensor,
+        point_coords: torch.Tensor,
+        point_labels: torch.Tensor,
+        mask_input: torch.Tensor,
+        has_mask_input: torch.Tensor,
+        orig_im_size: torch.Tensor,
+    ):
+        sparse_embedding = self._embed_points(point_coords, point_labels)
+        dense_embedding = self._embed_masks(mask_input, has_mask_input)
+        masks, scores = self.model.mask_decoder.predict_masks(
+            image_embeddings=image_embeddings,
+            image_pe=self.model.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embedding,
+            dense_prompt_embeddings=dense_embedding,
+        )
+        if self.use_stability_score:
+            scores = calculate_stability_score(
+                masks, self.model.mask_threshold, self.stability_score_offset
+            )
+        if self.return_single_mask:
+            masks, scores = self.select_masks(masks, scores, point_coords.shape[1])
+        upscaled_masks = self.mask_postprocessing(masks, orig_im_size)
+        if self.return_extra_metrics:
+            stability_scores = calculate_stability_score(
+                upscaled_masks, self.model.mask_threshold, self.stability_score_offset
+            )
+            areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1)
+            return upscaled_masks, scores, stability_scores, areas, masks
+        return upscaled_masks, scores, masks

model/segment_anything/utils/transforms.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from copy import deepcopy
+from typing import Tuple
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torchvision.transforms.functional import resize  # type: ignore
+from torchvision.transforms.functional import to_pil_image
+class ResizeLongestSide:
+    """
+    Resizes images to the longest side 'target_length', as well as provides
+    methods for resizing coordinates and boxes. Provides methods for
+    transforming both numpy array and batched torch tensors.
+    """
+    def __init__(self, target_length: int) -> None:
+        self.target_length = target_length
+    def apply_image(self, image: np.ndarray) -> np.ndarray:
+        """
+        Expects a numpy array with shape HxWxC in uint8 format.
+        """
+        target_size = self.get_preprocess_shape(
+            image.shape[0], image.shape[1], self.target_length
+        )
+        return np.array(resize(to_pil_image(image), target_size))
+    def apply_coords(
+        self, coords: np.ndarray, original_size: Tuple[int, ...]
+    ) -> np.ndarray:
+        """
+        Expects a numpy array of length 2 in the final dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.target_length
+        )
+        coords = deepcopy(coords).astype(float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+    def apply_boxes(
+        self, boxes: np.ndarray, original_size: Tuple[int, ...]
+    ) -> np.ndarray:
+        """
+        Expects a numpy array shape Bx4. Requires the original image size
+        in (H, W) format.
+        """
+        boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+    def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
+        """
+        Expects batched images with shape BxCxHxW and float format. This
+        transformation may not exactly match apply_image. apply_image is
+        the transformation expected by the model.
+        """
+        # Expects an image in BCHW format. May not exactly match apply_image.
+        target_size = self.get_preprocess_shape(
+            image.shape[0], image.shape[1], self.target_length
+        )
+        return F.interpolate(
+            image, target_size, mode="bilinear", align_corners=False, antialias=True
+        )
+    def apply_coords_torch(
+        self, coords: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with length 2 in the last dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.target_length
+        )
+        coords = deepcopy(coords).to(torch.float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+    def apply_boxes_torch(
+        self, boxes: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with shape Bx4. Requires the original image
+        size in (H, W) format.
+        """
+        boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+    @staticmethod
+    def get_preprocess_shape(
+        oldh: int, oldw: int, long_side_length: int
+    ) -> Tuple[int, int]:
+        """
+        Compute the output size given input size and target long side length.
+        """
+        scale = long_side_length * 1.0 / max(oldh, oldw)
+        newh, neww = oldh * scale, oldw * scale
+        neww = int(neww + 0.5)
+        newh = int(newh + 0.5)
+        return (newh, neww)

model/unilm/beit3/README.md ADDED Viewed

	@@ -0,0 +1,191 @@

+# [(BEiT-3) Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks](https://arxiv.org/abs/2208.10442)
+Official PyTorch implementation and pretrained models of BEiT-3.
+The code and pretrained models of **BEiT** can be found at [here](https://github.com/microsoft/unilm/tree/master/beit).
+The code and pretrained models of **BEiT v2** can be found at [here](https://github.com/microsoft/unilm/tree/master/beit2).
+- March, 2023: release [the code and pretrained models of **BEiT-3**](https://github.com/microsoft/unilm/tree/master/beit3)
+- March, 2023: [**BEiT-3**](https://arxiv.org/abs/2208.10442) was accepted by **CVPR 2023**.
+- Sept 2022: release [the code and pretrained models of **BEiT v2**](https://github.com/microsoft/unilm/tree/master/beit2)
+- Aug 2022: release preprint [Image as a Foreign Language: BEiT Pretraining for All Vision and Vision-Language Tasks](https://arxiv.org/abs/2208.10442)
+- Aug 2022: release preprint [BEiT v2: Masked Image Modeling with Vector-Quantized Visual Tokenizers](https://arxiv.org/abs/2208.06366)
+- June 2022: release preprint [VL-BEiT: Generative Vision-Language Pretraining](https://arxiv.org/abs/2206.01127)
+- March, 2022: add [linear probe examples](https://github.com/microsoft/unilm/blob/master/beit/get_started_for_image_classification.md#example-linear-probe-on-imagenet)
+- January, 2022: [**BEiT**](https://openreview.net/forum?id=p-BhZSz59o4) was accepted by **ICLR 2022 as Oral presentation** (54 out of 3391).
+- August 2021: [**BEiT**](https://huggingface.co/transformers/master/model_doc/beit.html) is on [HuggingFace](https://github.com/huggingface/transformers)
+- July 2021: BEiT-large achieves **[state-of-the-art results on ADE20K](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k) (a big jump to 57.0 mIoU) for semantic segmentation**.
+- July 2021: BEiT-large achieves **state-of-the-art ImageNet top-1 accuracy (88.6%) under the setting without extra data other than ImageNet-22k**.
+- July 2021: release [the code and pretrained models of **BEiT**](https://github.com/microsoft/unilm/tree/master/beit)
+- June 2021: release preprint [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254)
+## Pretrained models
+We provide BEiT-3 weights pretrained on monomodal and multimodal data. Our large-size model outperforms previous large-size models across various vision-language and vision downstream tasks. The models were pretrained with 224x224 resolution.
+### Tips
+- For vision-language tasks that require deep fusion, we recommend using `BEiT3-base` and `BEiT3-large`.
+- For image-text retrieval or vision tasks, using `BEiT3-base-itc` and `BEiT3-large-itc` usually achieves better performance.
+### Download Checkpoints
+1. Models pretrained on ImageNet-21k images, 160 GB text documents, and web-scale image-text pairs (collected from [LAION-400M](https://laion.ai/blog/laion-400-open-dataset/), [English LAION-2B](https://laion.ai/blog/laion-5b/), [COYO-700M](https://github.com/kakaobrain/coyo-dataset), and CC15M).
+   - [`BEiT3-base`](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224.pth): #layer=12; hidden=768; FFN factor=4x; #head=12; patch=16x16; #parameters: 276M
+   - [`BEiT3-large`](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224.pth): #layer=24; hidden=1024; FFN factor=4x; #head=16; patch=16x16; #parameters: 746M
+2. Perform image-text contrastive intermediate tuning on `BEiT3-base` and `BEiT3-large`.
+   - [`BEiT3-base-itc`](https://github.com/addf400/files/releases/download/beit3/beit3_base_itc_patch16_224.pth): #layer=12; hidden=768; FFN factor=4x; #head=12; patch=16x16; #parameters: 222M
+   - [`BEiT3-large-itc`](https://github.com/addf400/files/releases/download/beit3/beit3_large_itc_patch16_224.pth): #layer=24; hidden=1024; FFN factor=4x; #head=16; patch=16x16; #parameters: 674M
+3. Add indomain image-text pairs (COCO and VG) to continue training `BEiT3-base` and `BEiT3-large` using masked data modeling. The indomain models achieve better performance on VQAv2 and NLVR2 tasks.
+   - [`BEiT3-base-indomain`](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224.pth): #layer=12; hidden=768; FFN factor=4x; #head=12; patch=16x16; #parameters: 276M
+   - [`BEiT3-large-indomain`](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_224.pth): #layer=24; hidden=1024; FFN factor=4x; #head=16; patch=16x16; #parameters: 746M
+### Text Tokenizer
+[beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from transformers import XLMRobertaTokenizer
+tokenizer = XLMRobertaTokenizer("/your_beit3_model_path/beit3.spm")
+```
+### Architecture
+We use [Magneto](https://arxiv.org/abs/2210.06423) with decoupled Multiway Transformer as the backbone architecture. Magneto can have better training stability and obtain better performance across modalities (such as vision and language). The implementation is based on the [torchscale](https://github.com/microsoft/torchscale/blob/main/torchscale/model/BEiT3.py) package.
+## Setup
+```
+alias=`whoami | cut -d'.' -f2`; docker run -it --rm --runtime=nvidia --ipc=host --privileged -v /home/${alias}:/home/${alias} pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel bash
+```
+Clone the repo and install required packages:
+```
+git clone https://github.com/microsoft/unilm.git
+cd unilm/beit3
+pip install -r requirements.txt
+```
+## Fine-tuning on ImageNet-1k (Image Classification)
+The detailed instructions can be found at [`get_started_for_image_classification.md`](get_started/get_started_for_image_classification.md). We only use vision-related parameters for image classification fine-tuning.
+| initialized checkpoint | resolution | acc@1 | acc@5 | #params | weight |
+|:----------------------------------------|:----------:|:-----:|:-----:|:-------:|-------------------|
+| [beit3_base_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224.pth) | 224x224 | 85.4 | 97.6 | 87M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224_in1k.pth) |
+| [beit3_base_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224.pth) | 224x224 | 85.4 | 97.6 | 87M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224_in1k.pth) |
+| [beit3_large_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224.pth) | 224x224 | 87.6 | 98.3 | 305M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224_in1k.pth) |
+| [beit3_large_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_224.pth) | 224x224 | 87.5 | 98.3 | 305M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_224_in1k.pth) |
+## Fine-tuning on VQAv2 (Visual Question Answering)
+The detailed instructions can be found at [`get_started_for_vqav2.md`](get_started/get_started_for_vqav2.md).
+| initialized checkpoint | resolution | augmented data | test-dev | test-std | #params | weight |
+|:----------------------------------------|:----------:|:-----:|:-----:|:-----:|:-------:|-------------------|
+| [beit3_base_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224.pth) | 480x480 | - | 77.65 | - | 228M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_480_vqa.pth) |
+| [beit3_base_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224.pth) | 480x480 | - | 78.46 | - | 228M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_480_vqa.pth) |
+| [beit3_large_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224.pth) | 480x480 | - | 81.85 | - | 683M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_480_vqa.pth) |
+| [beit3_large_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_224.pth) | 480x480 | - | 82.53 | - | 683M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_480_vqa.pth) |
+| [beit3_large_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_224.pth) | 768x768 | VGQA | 82.97 | 83.03 | 684M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_768_vgqaaug_vqa.pth) |
+## Fine-tuning on NLVR2 (Visual Reasoning)
+The detailed instructions can be found at [`get_started_for_nlvr2.md`](get_started/get_started_for_nlvr2.md).
+| initialized checkpoint | resolution | dev | test-P | #params | weight |
+|:----------------------------------------|:----------:|:-----:|:-----:|:-------:|-------------------|
+| [beit3_base_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224.pth) | 224x224 | 83.6 | 84.4 | 226M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224_nlvr2.pth) |
+| [beit3_base_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224.pth) | 224x224 | 84.6 | 85.3 | 226M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224_nlvr2.pth) |
+| [beit3_large_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224.pth) | 224x224 | 88.5 | 89.4 | 681M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224_nlvr2.pth) |
+| [beit3_large_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_224.pth) | 224x224 | 89.2 | 90.0 | 681M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_indomain_patch16_224_nlvr2.pth) |
+## Fine-tuning on COCO Captioning and NoCaps (Image Captioning)
+The detailed instructions can be found at [`get_started_for_captioning.md`](get_started/get_started_for_captioning.md).
+### COCO Captioning
+| initialized checkpoint | resolution | test CIDEr | #params | weight |
+|:----------------------------------------|:----------:|:-----:|:-------:|-------------------|
+| [beit3_base_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224.pth) | 480x480 | 133.6 | 271M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_480_coco_captioning.pth) |
+| [beit3_base_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224.pth) | 480x480 | 135.0 | 271M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_480_coco_captioning.pth) |
+| [beit3_large_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224.pth) | 480x480 | 143.2 | 739M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_480_coco_captioning.pth) |
+### NoCaps
+| initialized checkpoint | resolution | val CIDEr | #params | weight |
+|:----------------------------------------|:----------:|:-----:|:-------:|-------------------|
+| [beit3_base_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_224.pth) | 480x480 | 104.4 | 271M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_480_nocaps.pth) |
+| [beit3_base_indomain_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_224.pth) | 480x480 | 105.6 | 271M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_indomain_patch16_480_nocaps.pth) |
+| [beit3_large_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_224.pth) | 480x480 | 120.2 | 739M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_480_nocaps.pth) |
+## Fine-tuning on COCO and Flickr30k Retrieval (Image-Text Retrieval)
+The detailed instructions can be found at [`get_started_for_retrieval.md`](get_started/get_started_for_retrieval.md).
+### COCO Retrieval
+| initialized checkpoint | resolution | IR@1 | TR@1 | #params | weight |
+|:----------------------------------------|:----------:|:-----:|:-----:|:-------:|-------------------|
+| [beit3_base_itc_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_itc_patch16_224.pth) | 384x384 | 61.4 | 79.1 | 222M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_384_coco_retrieval.pth) |
+| [beit3_large_itc_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_itc_patch16_224.pth) | 384x384 | 63.4 | 82.1 | 675M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_384_coco_retrieval.pth) |
+### Flickr30k Retrieval
+| initialized checkpoint | resolution | IR@1 | TR@1 | #params | weight |
+|:----------------------------------------|:----------:|:-----:|:-----:|:-------:|-------------------|
+| [beit3_base_itc_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_base_itc_patch16_224.pth) | 384x384 | 86.2 | 96.3 | 222M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_base_patch16_384_f30k_retrieval.pth) |
+| [beit3_large_itc_patch16_224](https://github.com/addf400/files/releases/download/beit3/beit3_large_itc_patch16_224.pth) | 384x384 | 88.1 | 97.2 | 675M | [link](https://github.com/addf400/files/releases/download/beit3/beit3_large_patch16_384_f30k_retrieval.pth) |
+## Citation
+If you find this repository useful, please consider citing our work:
+```
+@inproceedings{beit3,
+title={Image as a foreign language: {BEiT} pretraining for vision and vision-language tasks},
+author={Wenhui Wang and Hangbo Bao and Li Dong and Johan Bjorck and Zhiliang Peng and Qiang Liu and Kriti Aggarwal and Owais Khan Mohammed and Saksham Singhal and Subhojit Som and Furu Wei},
+booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+year={2023}
+}
+@article{beitv2,
+title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers},
+author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei},
+year={2022},
+eprint={2208.06366},
+archivePrefix={arXiv},
+primaryClass={cs.CV}
+}
+@inproceedings{beit,
+title={{BEiT}: {BERT} Pre-Training of Image Transformers},
+author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei},
+booktitle={International Conference on Learning Representations},
+year={2022},
+url={https://openreview.net/forum?id=p-BhZSz59o4}
+}
+```
+## Acknowledgement
+This repository is built using the [BEiT](https://github.com/microsoft/unilm/tree/master/beit), the [BEiTv2](https://github.com/microsoft/unilm/tree/master/beit2), the [CLIP](https://github.com/openai/CLIP), the [open_clip](https://github.com/mlfoundations/open_clip), the [Oscar](https://github.com/microsoft/Oscar), the [DeiT](https://github.com/facebookresearch/deit), the [Dino](https://github.com/facebookresearch/dino) repository and the [timm](https://github.com/rwightman/pytorch-image-models) library.
+## License
+This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
+[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
+### Contact Information
+For help or issues using BEiT-3 models, please submit a GitHub issue.

model/unilm/beit3/datasets.py ADDED Viewed

	@@ -0,0 +1,847 @@

+# --------------------------------------------------------
+# Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks (https://arxiv.org/abs/2208.10442)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit3
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------'
+import os
+import json
+import random
+import torch
+import glob
+from collections import defaultdict, Counter
+from torchvision import transforms
+from torchvision.datasets.folder import default_loader
+from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
+from timm.data.transforms import RandomResizedCropAndInterpolation
+from timm.data import create_transform
+import utils
+from glossary import normalize_word
+from randaug import RandomAugment
+class BaseDataset(torch.utils.data.Dataset):
+    def __init__(
+        self, data_path, split, transform,
+        tokenizer, num_max_bpe_tokens, task=None,
+    ):
+        index_files = self.get_index_files(split, task=task)
+        self.tokenizer = tokenizer
+        self.num_max_bpe_tokens = num_max_bpe_tokens
+        self.data_path = data_path
+        items = []
+        self.index_files = index_files
+        offset = 0
+        for _index_file in index_files:
+            index_file = os.path.join(data_path, _index_file)
+            with open(index_file, mode="r", encoding="utf-8") as reader:
+                for line in reader:
+                    data = json.loads(line)
+                    items.append(data)
+                print("Load %d image-text pairs from %s. " % (len(items) - offset, index_file))
+                offset = len(items)
+        self.items = items
+        self.bos_token_id = tokenizer.bos_token_id
+        self.eos_token_id = tokenizer.eos_token_id
+        self.pad_token_id = tokenizer.pad_token_id
+        self.loader = default_loader
+        self.transform = transform
+        self.split = split
+    @staticmethod
+    def get_index_files(split):
+        raise NotImplementedError()
+    def _get_image(self, image_path: str):
+        image_path = os.path.join(self.data_path, image_path)
+        image = self.loader(image_path)
+        return self.transform(image)
+    def _get_text_segment(self, text_segment, max_len=None):
+        if isinstance(text_segment, str):
+            tokens = self.tokenizer.tokenize(text_segment)
+        else:
+            tokens = text_segment[:]
+        if len(tokens) == 0:
+            raise RuntimeError("The text segment should contains at least one tokens!")
+        if max_len is None:
+            max_len = self.num_max_bpe_tokens
+        if len(tokens) > max_len - 2:
+            tokens = tokens[:max_len - 2]
+        tokens = [self.bos_token_id] + tokens[:] + [self.eos_token_id]
+        num_tokens = len(tokens)
+        padding_mask = [0] * num_tokens + [1] * (max_len - num_tokens)
+        return tokens + [self.pad_token_id] * (max_len - num_tokens), padding_mask, num_tokens
+    def _get_image_text_example(self, index: int, data: dict):
+        item = self.items[index]
+        img_path = item["image_path"]
+        img = self._get_image(img_path)
+        data["image"] = img
+        text_segment = item["text_segment"]
+        language_tokens, padding_mask, _ = self._get_text_segment(text_segment)
+        data["language_tokens"] = language_tokens
+        data["padding_mask"] = padding_mask
+    def __getitem__(self, index: int):
+        data = dict()
+        self._get_image_text_example(index, data)
+        return data
+    def __len__(self) -> int:
+        return len(self.items)
+    def __repr__(self) -> str:
+        head = "Dataset " + self.__class__.__name__
+        body = '{' + "\n  Number of items: %s," % self.__len__()
+        body += "\n  data root = %s," % self.data_path
+        body += "\n  split = %s," % self.split
+        body += "\n  dataset index files = %s" % str(self.index_files)
+        body += "\n  num max bpe tokens = %s" % self.num_max_bpe_tokens
+        body += "\n  transforms = ["
+        for t in self.transform.transforms:
+            body += "\n    %s" % str(t)
+        body += "\n  ]"
+        body += "\n}"
+        return head + body
+def _write_data_into_jsonl(items, jsonl_file):
+    with open(jsonl_file, mode="w", encoding="utf-8") as writer:
+        for data in items:
+            writer.write(json.dumps(data, indent=None))
+            writer.write('\n')
+    print("Write %s with %d items !" % (jsonl_file, len(items)))
+def _make_retrieval_coco_karpathy_dataset_index(
+        data_path,
+        tokenizer,
+        split=("train", "restval"),
+        split_name="train",
+):
+    coco_karpathy_split_json_file = os.path.join(data_path, "dataset_coco.json")
+    items = []
+    image_counter = set()
+    print("read %s" % coco_karpathy_split_json_file)
+    with open(coco_karpathy_split_json_file, mode="r", encoding="utf-8") as reader:
+        data = json.loads(reader.read())
+        for item in data["images"]:
+            if item["split"] in split:
+                image_path = os.path.join(item["filepath"], item["filename"])
+                for sent in item["sentences"]:
+                    tokens = tokenizer.tokenize(sent["raw"])
+                    token_ids = tokenizer.convert_tokens_to_ids(tokens)
+                    items.append({
+                            "image_path": image_path,
+                            "text_segment": token_ids,
+                            "image_id": len(image_counter),
+                    })
+                if image_path not in image_counter:
+                    image_counter.add(image_path)
+    print("Find %d images and %d image-text pairs for karpathy dataset %s split !" % \
+        (len(image_counter), len(items), split_name))
+    index_file = os.path.join(data_path, "coco_retrieval.%s.jsonl" % split_name)
+    _write_data_into_jsonl(items, index_file)
+    pass
def _make_captioning_coco_karpathy_dataset_index(
        data_path,
        tokenizer,
        split=("train", "restval"),
        split_name="train",
):
    """Write the COCO captioning index ``coco_captioning.<split_name>.jsonl``.

    Reads ``dataset_coco.json`` from ``data_path`` and keeps the images whose
    ``"split"`` field is contained in ``split``.

    * Images tagged ``"train"`` or ``"restval"`` produce one record per
      sentence, with the sentence's ``"raw"`` text tokenized and converted
      to ids by ``tokenizer``.
    * Every other selected image produces a single record whose
      ``"text_segment"`` is ``None``.

    Args:
        data_path: Directory holding ``dataset_coco.json``; the index file is
            written into the same directory.
        tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        split: Collection of split tags to keep.
        split_name: Name used in the output file name and the log line.
    """
    karpathy_json = os.path.join(data_path, "dataset_coco.json")
    print("read %s" % karpathy_json)
    with open(karpathy_json, mode="r", encoding="utf-8") as reader:
        images = json.loads(reader.read())["images"]

    records = []
    seen_paths = set()
    for entry in images:
        if entry["split"] not in split:
            continue
        image_path = os.path.join(entry["filepath"], entry["filename"])
        if entry["split"] in ["train", "restval"]:
            for sentence in entry["sentences"]:
                pieces = tokenizer.tokenize(sentence["raw"])
                records.append({
                    "image_path": image_path,
                    "text_segment": tokenizer.convert_tokens_to_ids(pieces),
                    "image_id": entry["cocoid"],
                })
        else:
            records.append({
                "image_path": image_path,
                "text_segment": None,
                "image_id": entry["cocoid"],
            })
        # A set ignores duplicates, so no membership test is needed.
        seen_paths.add(image_path)

    print("Find %d images and %d image-text pairs for karpathy dataset %s split !" %
          (len(seen_paths), len(records), split_name))
    _write_data_into_jsonl(
        records, os.path.join(data_path, "coco_captioning.%s.jsonl" % split_name))
def _make_nocaps_dataset_index(
        data_path,
        split="val",
):
    """Write the nocaps index file ``nocaps.<split>.jsonl`` into ``data_path``.

    Each image listed under ``"images"`` in the split's JSON file becomes one
    record with ``"text_segment"`` set to ``None``; the image path is
    ``<split>/<file_name>`` and the image id is the entry's ``"id"``.

    Args:
        data_path: Directory containing the nocaps JSON file; the index is
            written there as well.
        split: ``"val"`` or ``"test"``.

    Raises:
        RuntimeError: If ``split`` is neither ``"val"`` nor ``"test"``.
    """
    if split == "val":
        json_file = "nocaps_val_4500_captions.json"
    elif split == "test":
        json_file = "nocaps_test_image_info.json"
    else:
        # Previously fell through and crashed later with UnboundLocalError;
        # fail early with the same error style the dataset classes use.
        raise RuntimeError("split %s is not found!" % split)

    nocaps_split_json_file = os.path.join(data_path, json_file)
    print("read %s" % nocaps_split_json_file)
    with open(nocaps_split_json_file, mode="r", encoding="utf-8") as reader:
        data = json.loads(reader.read())

    items = []
    image_counter = set()
    for item in data["images"]:
        image_path = os.path.join(split, item["file_name"])
        items.append({
            "image_path": image_path,
            "text_segment": None,
            "image_id": item["id"],
        })
        image_counter.add(image_path)

    print("Find %d images and %d image-text pairs for nocaps dataset %s split !" %
          (len(image_counter), len(items), split))
    index_file = os.path.join(data_path, "nocaps.%s.jsonl" % split)
    _write_data_into_jsonl(items, index_file)
class NLVR2Dataset(BaseDataset):
    """Dataset for NLVR2: each sample pairs two images with one sentence."""

    @staticmethod
    def get_index_files(split, task=None):
        """Return a 1-tuple with the index file name for ``split``.

        Raises:
            RuntimeError: If ``split`` is not "train", "val" or "test".
        """
        for name, index_file in (
            ("train", "nlvr2.train.index.jsonl"),
            ("val", "nlvr2.dev.index.jsonl"),
            ("test", "nlvr2.test-P.index.jsonl"),
        ):
            if split == name:
                return (index_file, )
        raise RuntimeError("split %s is not found!" % split)

    def __getitem__(self, index: int):
        """Extend the base sample with the second image and the label."""
        sample = super().__getitem__(index)
        record = self.items[index]
        sample["image2"] = self._get_image(record["image2_path"])
        sample["label"] = record["label"]
        return sample

    @staticmethod
    def __preprocess_json(preifx, json_file, tokenizer, index_file):
        """Convert one NLVR2 annotation file (one JSON object per line).

        For each line the image stem is ``preifx[/directory]/<identifier
        without its last "-" field>``; the two images are that stem plus
        ``-img0.png`` / ``-img1.png``.  The label is 1 when the annotation's
        ``"label"`` equals the string ``"True"`` and 0 otherwise.
        """
        records = []
        with open(json_file, mode="r", encoding="utf-8") as reader:
            for raw_line in reader:
                example = json.loads(raw_line)
                if "directory" in example:
                    image_dir = os.path.join(preifx, str(example["directory"]))
                else:
                    image_dir = preifx
                pair_name = "-".join(example["identifier"].split("-")[:-1])
                stem = os.path.join(image_dir, pair_name)
                pieces = tokenizer.tokenize(example["sentence"])
                records.append({
                    "image_path": stem + "-img0.png",
                    "image2_path": stem + "-img1.png",
                    "text_segment": tokenizer.convert_tokens_to_ids(pieces),
                    "label": 1 if example["label"] == "True" else 0,
                    "identifier": example["identifier"],
                })
        _write_data_into_jsonl(records, index_file)

    @classmethod
    def make_dataset_index(cls, data_path, tokenizer, nlvr_repo_path):
        """Build the train / val / test index files under ``data_path``.

        Annotations are read from ``nlvr2/data/{train,dev,test1}.json``
        inside ``nlvr_repo_path``.
        """
        for image_prefix, annotation_file, split in (
            ("images/train", "nlvr2/data/train.json", "train"),
            ("dev", "nlvr2/data/dev.json", "val"),
            ("test1", "nlvr2/data/test1.json", "test"),
        ):
            cls.__preprocess_json(
                preifx=image_prefix,
                json_file=os.path.join(nlvr_repo_path, annotation_file),
                tokenizer=tokenizer,
                index_file=os.path.join(data_path, cls.get_index_files(split)[0]),
            )
class ImageNetDataset(BaseDataset):
    """Image-classification dataset backed by ``imagenet.<split>.index.jsonl``."""

    @staticmethod
    def get_index_files(split, task=None):
        """Return a 1-tuple with the index file name for ``split``.

        Raises:
            RuntimeError: If ``split`` is not "train", "val" or "test".
        """
        if split == "train":
            return ("imagenet.train.index.jsonl", )
        elif split == "val":
            return ("imagenet.val.index.jsonl", )
        elif split == "test":
            # "test" reuses the validation index file.
            return ("imagenet.val.index.jsonl", )
        else:
            raise RuntimeError("split %s is not found!" % split)

    def __getitem__(self, index: int):
        """Return ``{"image": <loaded image>, "label": <class index>}``."""
        item = self.items[index]
        return {
            "image": self._get_image(item["image_path"]),
            "label": item["label"],
        }

    @staticmethod
    def _find_classes(dir):
        """
        Finds the class folders in a dataset.
        Args:
            dir (string): Root directory path.
        Returns:
            tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary.
        Ensures:
            No class is a subdirectory of another.
        """
        # Use scandir as a context manager so the directory handle is closed
        # deterministically.
        with os.scandir(dir) as entries:
            classes = sorted(entry.name for entry in entries if entry.is_dir())
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    @staticmethod
    def _common_dir_prefix(first_path, second_path):
        """Return the longest shared leading part that ends on a separator.

        The character-wise common prefix is trimmed back to the last path
        separator so a directory name is never split (``.../train`` and
        ``.../test`` share ``.../``, not ``.../t``).  Returns ``""`` when the
        paths share no complete leading directory.
        """
        shared = os.path.commonprefix([first_path, second_path])
        separators = {os.sep, "/"}
        if os.altsep:
            separators.add(os.altsep)
        boundary = max(shared.rfind(sep) for sep in separators) + 1
        return shared[:boundary]

    @staticmethod
    def _make_imagenet_index(data_path, index_path, data_path_prefix, class_to_idx, split):
        """Walk ``data_path`` and write ``imagenet.<split>.index.jsonl``.

        Every file found under ``data_path/<class>`` (symlinks followed)
        becomes one record labelled with ``class_to_idx[<class>]``.  Class
        folders missing from ``data_path`` are skipped.  ``data_path_prefix``
        is stripped from the front of each stored path.
        """
        items = []
        index_file = os.path.join(index_path, f"imagenet.{split}.index.jsonl")
        for target_class in sorted(class_to_idx.keys()):
            class_index = class_to_idx[target_class]
            target_dir = os.path.join(data_path, target_class)
            if not os.path.isdir(target_dir):
                continue
            for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
                for fname in sorted(fnames):
                    path = os.path.join(root, fname)
                    # Strip only the leading prefix: str.replace would also
                    # delete later occurrences (e.g. every "/" when the
                    # prefix is "/").
                    if data_path_prefix and path.startswith(data_path_prefix):
                        path = path[len(data_path_prefix):]
                    items.append({
                        "image_path": path,
                        "label": class_index,
                    })
        _write_data_into_jsonl(items, index_file)

    @classmethod
    def make_dataset_index(cls, train_data_path, val_data_path, index_path):
        """Write the train and val index files into ``index_path``.

        Stored image paths are relative to the directory prefix shared by
        ``train_data_path`` and ``val_data_path``.  Class indices come from
        the sub-directories of ``train_data_path`` and are reused for val.
        """
        # The previous list.index(0) lookup raised ValueError whenever one
        # path was a prefix of the other and could cut a directory name in
        # half; the helper handles both cases.
        data_path_prefix = cls._common_dir_prefix(train_data_path, val_data_path)
        classes, class_to_idx = cls._find_classes(train_data_path)
        cls._make_imagenet_index(
            data_path=train_data_path, index_path=index_path, data_path_prefix=data_path_prefix,
            class_to_idx=class_to_idx, split="train",
        )
        cls._make_imagenet_index(
            data_path=val_data_path, index_path=index_path, data_path_prefix=data_path_prefix,
            class_to_idx=class_to_idx, split="val",
        )
class VQAv2Dataset(BaseDataset):
    """VQAv2 dataset: image + question samples with soft answer targets."""
    def __init__(self, data_path, **kwargs):
        """Initialise the base dataset and load ``answer2label.txt``.

        The file is read from ``data_path``; each line is a JSON object with
        an ``"answer"`` and a ``"label"``.  Labels must equal the 0-based line
        number (checked with ``assert``).
        """
        super().__init__(data_path=data_path, **kwargs)
        ans2label_file = os.path.join(data_path, "answer2label.txt")
        ans2label = {}
        label2ans = []
        with open(ans2label_file, mode="r", encoding="utf-8") as reader:
            for i, line in enumerate(reader):
                data = json.loads(line)
                ans = data["answer"]
                label = data["label"]
                label = int(label)
                # label2ans is indexed by label, so labels must be contiguous
                # and in file order.
                assert label == i
                ans2label[ans] = i
                label2ans.append(ans)
        self.ans2label = ans2label
        self.label2ans = label2ans
    @staticmethod
    def get_index_files(split, task=None):
        """Return the tuple of index file names for ``split``.

        "train" uses two files (the train set plus the trainable part of
        val); "val", "test" and "test-dev" use one each.

        Raises:
            RuntimeError: For any other ``split``.
        """
        if split == "train":
            return ("vqa.train.jsonl", "vqa.trainable_val.jsonl")
        elif split == "val":
            return ("vqa.rest_val.jsonl", )
        elif split == "test":
            return ("vqa.test.jsonl", )
        elif split == "test-dev":
            return ("vqa.test-dev.jsonl", )
        else:
            raise RuntimeError("split %s is not found!" % split)
    def __getitem__(self, index: int):
        """Return the base sample plus either ``"labels"`` or ``"qid"``.

        When the item has a non-empty ``"labels"`` list, ``"labels"`` is a
        float tensor of length ``len(self.label2ans)`` holding each listed
        label's score (zero elsewhere).  Otherwise the item's ``"qid"`` is
        attached instead.
        """
        data = super().__getitem__(index)
        if "labels" in self.items[index] and len(self.items[index]["labels"]) > 0:
            labels = [0.] * len(self.label2ans)
            for l, s in zip(self.items[index]["labels"], self.items[index]["scores"]):
                labels[l] = s
            data["labels"] = torch.FloatTensor(labels)
        else:
            data["qid"] = self.items[index]["qid"]
        return data
    @staticmethod
    def get_score(occurences):
        """Map an answer's occurrence count to a soft score.

        0 -> 0.0, 1 -> 0.3, 2 -> 0.6, 3 -> 0.9, any other value -> 1.0.
        """
        if occurences == 0:
            return 0.0
        elif occurences == 1:
            return 0.3
        elif occurences == 2:
            return 0.6
        elif occurences == 3:
            return 0.9
        else:
            return 1.0
    @classmethod
    def make_dataset_index(cls, data_path, tokenizer, annotation_data_path):
        """Build all VQAv2 index files and ``answer2label.txt`` in ``data_path``.

        Reads the question and annotation JSON files from
        ``annotation_data_path`` and the images from
        ``data_path/{train2014,val2014,test2015}``.  Writes
        ``vqa.{train,val,test,test-dev}.jsonl``, ``vqa.trainable_val.jsonl``,
        ``vqa.rest_val.jsonl`` and ``answer2label.txt``.

        Uses the global ``random`` module for shuffling, so the output order
        and the val split depend on the current random state.
        """
        with open(os.path.join(annotation_data_path, "v2_OpenEnded_mscoco_train2014_questions.json"), "r") as fp:
            questions_train2014 = json.load(fp)["questions"]
        with open(os.path.join(annotation_data_path, "v2_OpenEnded_mscoco_val2014_questions.json"), "r") as fp:
            questions_val2014 = json.load(fp)["questions"]
        with open(os.path.join(annotation_data_path, "v2_OpenEnded_mscoco_test2015_questions.json"), "r") as fp:
            questions_test2015 = json.load(fp)["questions"]
        with open(os.path.join(annotation_data_path, "v2_OpenEnded_mscoco_test-dev2015_questions.json"), "r") as fp:
            questions_test_dev2015 = json.load(fp)["questions"]
        with open(os.path.join(annotation_data_path, "v2_mscoco_train2014_annotations.json"), "r") as fp:
            annotations_train2014 = json.load(fp)["annotations"]
        with open(os.path.join(annotation_data_path, "v2_mscoco_val2014_annotations.json"), "r") as fp:
            annotations_val2014 = json.load(fp)["annotations"]
        # Step 1: annotations[split][image_id][question_id] -> question text
        # and its token ids.
        annotations = dict()
        for split, questions in zip(
            ["train", "val", "test", "test-dev"],
            [questions_train2014, questions_val2014, questions_test2015, questions_test_dev2015],
        ):
            _annot = defaultdict(dict)
            for q in questions:
                question_text = q["question"]
                tokens = tokenizer.tokenize(question_text)
                token_ids = tokenizer.convert_tokens_to_ids(tokens)
                assert q["question_id"] not in _annot[q["image_id"]]
                _annot[q["image_id"]][q["question_id"]] = {
                    "question": question_text,
                    "token_ids": token_ids,
                }
            annotations[split] = _annot
        # Step 2: build the answer vocabulary from the train + val
        # "multiple_choice_answer" fields.
        all_major_answers = list()
        for split, annots in zip(
            ["train", "val"], [annotations_train2014, annotations_val2014],
        ):
            # _annot = annotations[split]
            for q in annots:
                all_major_answers.append(q["multiple_choice_answer"])
        all_major_answers = [normalize_word(word) for word in all_major_answers]
        # Keep only normalized answers seen at least 9 times; labels follow
        # the Counter's iteration order.
        counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 9}
        ans2label = {k: i for i, k in enumerate(counter.keys())}
        label2ans = list(counter.keys())
        # Step 3: attach label / score lists to every train and val question.
        for split, annots in zip(
            ["train", "val"], [annotations_train2014, annotations_val2014],
        ):
            _annot = annotations[split]
            for q in annots:
                answers = q["answers"]
                answer_count = {}
                for answer in answers:
                    answer_ = answer["answer"]
                    answer_count[answer_] = answer_count.get(answer_, 0) + 1
                labels = []
                scores = []
                # NOTE(review): these raw answer strings are looked up without
                # normalize_word, while ans2label keys are normalized --
                # confirm this is intended.
                for answer in answer_count:
                    if answer not in ans2label:
                        continue
                    labels.append(ans2label[answer])
                    score = cls.get_score(answer_count[answer])
                    scores.append(score)
                assert "labels" not in _annot[q["image_id"]][q["question_id"]]
                assert "question" in _annot[q["image_id"]][q["question_id"]]
                _annot[q["image_id"]][q["question_id"]]["labels"] = labels
                _annot[q["image_id"]][q["question_id"]]["scores"] = scores
        # Step 4: drop train/val questions with no in-vocabulary answer, and
        # images left with no question.
        for split in ["train", "val"]:
            filtered_annot = dict()
            for ik, iv in annotations[split].items():
                new_q = dict()
                for qk, qv in iv.items():
                    if len(qv["labels"]) != 0:
                        new_q[qk] = qv
                if len(new_q) != 0:
                    filtered_annot[ik] = new_q
            annotations[split] = filtered_annot
        # Step 5: match image files to annotations and write one index file
        # per split.
        split2items = {}
        for split in ["train", "val", "test", "test-dev"]:
            annot = annotations[split]
            # "test" and "test-dev" both read images from test2015.
            split_name = {
                "train": "train2014",
                "val": "val2014",
                "test": "test2015",
                "test-dev": "test2015",
            }[split]
            paths = list(glob.glob(f"{data_path}/{split_name}/*.jpg"))
            random.shuffle(paths)
            # Image id = integer after the last "_" of the file name, minus
            # the 4-character extension.  Splitting on "/" assumes POSIX-style
            # separators in the globbed paths.
            annot_paths = [path for path in paths \
                if int(path.split("/")[-1].split("_")[-1][:-4]) in annot]
            if len(paths) == len(annot_paths):
                print("all images have caption annotations")
            else:
                print("not all images have caption annotations")
            print(len(paths), len(annot_paths), len(annot))
            items = []
            for path in annot_paths:
                iid = int(path.split("/")[-1].split("_")[-1][:-4])
                _annot = annotations[split][iid]
                for qid in _annot:
                    q = _annot[qid]
                    if split in ["train", "val"]:
                        labels = q["labels"]
                        scores = q["scores"]
                    else:
                        # Test splits carry no answers.
                        labels, scores = [], []
                    items.append({
                        "image_path": os.path.join(split_name, path.split('/')[-1]),
                        "text_segment": q["token_ids"],
                        "labels": labels,
                        "scores": scores,
                        "qid": qid,
                    })
            split2items[split] = items
            _write_data_into_jsonl(items=items, jsonl_file=os.path.join(data_path, "vqa.%s.jsonl" % split))
        # Following ViLT, we use 1000 images of the original val set as the final val set
        val_image2items = defaultdict(list)
        for item in split2items["val"]:
            val_image2items[item["image_path"]].append(item)
        print("Contains %d image and %d pairs for val set!" % (len(val_image2items), len(split2items["val"])))
        val_images = list(val_image2items.keys())
        random.shuffle(val_images)
        trainable_val = []
        rest_val = []
        # The first 1000 shuffled images form rest_val (the final val set);
        # the remainder become trainable_val.
        for i, image_id in enumerate(val_images):
            if i < 1000:
                rest_val += val_image2items[image_id]
            else:
                trainable_val += val_image2items[image_id]
        _write_data_into_jsonl(items=trainable_val, jsonl_file=os.path.join(data_path, "vqa.trainable_val.jsonl"))
        _write_data_into_jsonl(items=rest_val, jsonl_file=os.path.join(data_path, "vqa.rest_val.jsonl"))
        # One JSON object per line, in label order; __init__ reads this back.
        with open(os.path.join(data_path, "answer2label.txt"), mode="w", encoding="utf-8") as writer:
            for ans in ans2label:
                to_json = {
                    "answer": ans,
                    "label": ans2label[ans]
                }
                writer.write("%s\n" % json.dumps(to_json))
class RetrievalDataset(BaseDataset):
    """Image-text retrieval dataset (used for Flickr30k and COCO retrieval)."""

    @staticmethod
    def get_index_files(split, task=None):
        """Return a 1-tuple ``("<task>.<split>.jsonl",)``.

        Raises:
            RuntimeError: If ``split`` is not "train", "val" or "test".
        """
        for name in ("train", "val", "test"):
            if split == name:
                return (f"{task}.{name}.jsonl", )
        raise RuntimeError("split %s is not found!" % split)

    def __getitem__(self, index: int):
        """Return the base sample with the item's ``image_id`` attached."""
        sample = super().__getitem__(index)
        sample["image_id"] = self.items[index]["image_id"]
        return sample

    @staticmethod
    def make_flickr30k_dataset_index(data_path, tokenizer, karpathy_path):
        """Write ``flickr30k.<split>.jsonl`` for every split in the Karpathy file.

        Reads ``dataset_flickr30k.json`` from ``karpathy_path``.  Each
        sentence becomes one record; ``image_id`` is the number of images
        already seen in that split, so ids are consecutive per split.
        """
        with open(os.path.join(karpathy_path, "dataset_flickr30k.json"), "r") as reader:
            images = json.loads(reader.read())["images"]

        split2items = defaultdict(list)
        split2images = defaultdict(set)
        for entry in images:
            image_path = os.path.join("flickr30k-images", entry["filename"])
            split = entry["split"]
            for sentence in entry["sentences"]:
                pieces = tokenizer.tokenize(sentence["raw"])
                token_ids = tokenizer.convert_tokens_to_ids(pieces)
                split2items[split].append({
                    "image_path": image_path,
                    "text_segment": token_ids,
                    "image_id": len(split2images[split]),
                })
            # Each file name may appear only once per split.
            assert entry["filename"] not in split2images[split]
            split2images[split].add(entry["filename"])

        for split, records in split2items.items():
            print("%d images and %d image-text pairs!" % (len(split2images[split]), len(records)))
            _write_data_into_jsonl(records, os.path.join(data_path, "flickr30k.%s.jsonl" % split))

    @staticmethod
    def make_coco_dataset_index(data_path, tokenizer):
        """Write the COCO retrieval index files for train, val and test."""
        for source_splits, output_name in (
            (("train", "restval"), "train"),
            (("val", ), "val"),
            (("test", ), "test"),
        ):
            _make_retrieval_coco_karpathy_dataset_index(
                data_path, tokenizer, split=source_splits, split_name=output_name)
class CaptioningDataset(BaseDataset):
    """Captioning dataset that randomly masks caption tokens."""

    def __init__(self, data_path, split, transform,
                tokenizer, num_max_bpe_tokens, task, mask_prob):
        """Initialise the base dataset and remember the masking settings.

        ``mask_prob`` is the fraction of caption tokens to mask; the mask
        token id and vocabulary size are taken from ``tokenizer``.
        """
        super().__init__(
            data_path=data_path, split=split,
            transform=transform, tokenizer=tokenizer,
            num_max_bpe_tokens=num_max_bpe_tokens, task=task,
        )
        self.mask_token_id = tokenizer.mask_token_id
        self.language_vocab_size = tokenizer.vocab_size
        self.mask_prob = mask_prob

    @staticmethod
    def get_index_files(split, task=None):
        """Return a 1-tuple with the index file for ``split``.

        "train" always maps to the COCO captioning train index, whatever the
        task; "val" and "test" map to ``<task>.<split>.jsonl``.

        Raises:
            RuntimeError: For any other ``split``.
        """
        if split == "train":
            return ("coco_captioning.train.jsonl", )
        for name in ("val", "test"):
            if split == name:
                return (f"{task}.{name}.jsonl", )
        raise RuntimeError("split %s is not found!" % split)

    def _get_mask_token(self, token):
        """Pick the replacement for a masked position.

        80%: the mask token; 10%: the original token; 10%: a random id drawn
        from ``[3, vocab_size - 1]``.
        """
        draw = random.random()
        if draw >= 0.9:
            return random.randint(3, self.language_vocab_size - 1)
        if draw >= 0.8:
            return token
        return self.mask_token_id

    def _masking_on_text_tokens(self, tokens, num_tokens, mask_prob):
        """Mask positions of ``tokens`` in place and return ``(tokens, flags)``.

        The number of masked positions is ``round(num_tokens * mask_prob)``,
        clamped to ``[1, num_tokens - 1]``.  Positions are drawn from
        ``1 .. num_tokens - 1``, so index 0 is never masked.  ``flags`` is a
        0/1 list as long as ``tokens``.
        """
        masked_flags = [0] * len(tokens)
        target = max(min(int(num_tokens * mask_prob + 0.5), num_tokens - 1), 1)
        masked_so_far = 0
        while masked_so_far < target:
            position = random.randint(1, num_tokens - 1)
            if masked_flags[position] != 0:
                continue  # already masked, draw again
            masked_flags[position] = 1
            tokens[position] = self._get_mask_token(tokens[position])
            masked_so_far += 1
        return tokens, masked_flags

    def __getitem__(self, index: int):
        """Return the image, its id and, when a caption exists, the token fields.

        Items whose ``text_segment`` is ``None`` yield only ``"image"`` and
        ``"image_id"``.
        """
        record = self.items[index]
        sample = dict()
        sample["image"] = self._get_image(record["image_path"])
        sample["image_id"] = record["image_id"]

        text_segment = record["text_segment"]
        if text_segment is None:
            return sample

        language_tokens, padding_mask, num_tokens = self._get_text_segment(text_segment)
        # Mask a copy so the clean tokens stay available.
        masked_tokens, language_masked_pos = self._masking_on_text_tokens(
            language_tokens[:], num_tokens, self.mask_prob)
        sample["language_tokens"] = language_tokens
        sample["masked_tokens"] = masked_tokens
        sample["language_masked_pos"] = language_masked_pos
        sample["padding_mask"] = padding_mask
        return sample

    @staticmethod
    def make_coco_captioning_dataset_index(data_path, tokenizer):
        """Write the COCO captioning index files for train, val and test."""
        for source_splits, output_name in (
            (("train", "restval"), "train"),
            (("val", ), "val"),
            (("test", ), "test"),
        ):
            _make_captioning_coco_karpathy_dataset_index(
                data_path, tokenizer, split=source_splits, split_name=output_name)

    @staticmethod
    def make_nocaps_captioning_dataset_index(data_path):
        """Write the nocaps index files for the val and test splits."""
        for nocaps_split in ("val", "test"):
            _make_nocaps_dataset_index(data_path, split=nocaps_split)
# Registry mapping each task name (``args.task``) to its dataset class.
# Several tasks share one class: both retrieval tasks use RetrievalDataset
# and both captioning tasks use CaptioningDataset.
task2dataset = dict(
    nlvr2=NLVR2Dataset,
    vqav2=VQAv2Dataset,
    flickr30k=RetrievalDataset,
    coco_retrieval=RetrievalDataset,
    coco_captioning=CaptioningDataset,
    nocaps=CaptioningDataset,
    imagenet=ImageNetDataset,
)
def create_dataloader(dataset, is_train, batch_size, num_workers, pin_mem, dist_eval=False):
    """Wrap ``dataset`` in a ``torch.utils.data.DataLoader``.

    A ``DistributedSampler`` (shuffling only when ``is_train``) is used for
    training and for distributed evaluation; otherwise the dataset is read
    sequentially.  The last incomplete batch is dropped only when
    ``is_train``.  Batches are collated with
    ``utils.merge_batch_tensors_by_dict_key``.
    """
    if not (is_train or dist_eval):
        sampler = torch.utils.data.SequentialSampler(dataset)
    else:
        world_size = utils.get_world_size()
        rank = utils.get_rank()
        uneven_eval = not is_train and dist_eval and len(dataset) % world_size != 0
        if uneven_eval:
            print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
        sampler = torch.utils.data.DistributedSampler(
            dataset, num_replicas=world_size, rank=rank, shuffle=is_train
        )

    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_mem,
        drop_last=is_train,
        collate_fn=utils.merge_batch_tensors_by_dict_key,
    )
    return loader
def build_transform(is_train, args):
    """Build the image transform pipeline for ``args.task``.

    The "imagenet" task is delegated to ``build_imagenet_transform``.  For
    every other task:

    * training: random resized crop (scale 0.5-1.0), horizontal flip,
      optional ``RandomAugment`` when ``args.randaug`` is set, then tensor
      conversion and normalisation;
    * evaluation: resize to a square of ``args.input_size``, tensor
      conversion and normalisation.

    Both branches normalise with the ``IMAGENET_INCEPTION_*`` statistics.
    """
    if args.task in ["imagenet"]:
        return build_imagenet_transform(is_train, args)

    if not is_train:
        return transforms.Compose([
            transforms.Resize((args.input_size, args.input_size), interpolation=3),
            transforms.ToTensor(),
            transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD)
        ])

    steps = [
        RandomResizedCropAndInterpolation(args.input_size, scale=(0.5, 1.0), interpolation=args.train_interpolation),
        transforms.RandomHorizontalFlip(),
    ]
    if args.randaug:
        randaug = RandomAugment(
            2, 7, isPIL=True,
            augs=[
                'Identity','AutoContrast','Equalize','Brightness','Sharpness',
                'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate',
            ])
        steps.append(randaug)
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD))
    return transforms.Compose(steps)
def build_imagenet_transform(is_train, args):
    """Build the ImageNet transform pipeline.

    Training uses ``create_transform`` configured from ``args``; for inputs
    of 32 pixels or less its first transform is replaced by a padded
    ``RandomCrop``.  Evaluation resizes to ``input_size / crop_pct`` and
    centre-crops (only for inputs larger than 32), then converts to a tensor
    and normalises with the ``IMAGENET_DEFAULT_*`` statistics.

    Side effect: on the evaluation path with inputs larger than 32,
    ``args.crop_pct`` is set to ``1.0`` if it was ``None``.
    """
    use_resize = args.input_size > 32

    if is_train:
        # this should always dispatch to transforms_imagenet_train
        train_transform = create_transform(
            input_size=args.input_size,
            is_training=True,
            color_jitter=args.color_jitter,
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
            re_prob=args.reprob,
            re_mode=args.remode,
            re_count=args.recount,
            mean=IMAGENET_DEFAULT_MEAN,
            std=IMAGENET_DEFAULT_STD,
        )
        if not use_resize:
            # replace RandomResizedCropAndInterpolation with RandomCrop
            train_transform.transforms[0] = transforms.RandomCrop(
                args.input_size, padding=4)
        return train_transform

    steps = []
    if use_resize:
        if args.crop_pct is None:
            args.crop_pct = 1.0
        resize_to = int(args.input_size / args.crop_pct)
        # to maintain same ratio w.r.t. 224 images
        steps.append(transforms.Resize(resize_to, interpolation=3))
        steps.append(transforms.CenterCrop(args.input_size))
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD))
    return transforms.Compose(steps)
def get_sentencepiece_model_for_beit3(args):
    """Return an ``XLMRobertaTokenizer`` built from ``args.sentencepiece_model``.

    ``transformers`` is imported here rather than at module level, so it is
    only needed when a tokenizer is actually requested.
    """
    from transformers import XLMRobertaTokenizer

    tokenizer = XLMRobertaTokenizer(args.sentencepiece_model)
    return tokenizer
def create_dataset_by_split(args, split, is_train=True):
    """Create the dataset for ``split`` and return a data loader over it.

    The dataset class is looked up in ``task2dataset`` by ``args.task``.
    Captioning tasks additionally receive ``mask_prob`` from
    ``args.captioning_mask_prob``.

    Batch size: ``args.batch_size`` when training; otherwise
    ``args.eval_batch_size`` if that attribute exists and is not ``None``,
    else ``int(args.batch_size * 1.5)``.
    """
    transform = build_transform(is_train=is_train, args=args)
    dataset_class = task2dataset[args.task]
    tokenizer = get_sentencepiece_model_for_beit3(args)

    extra_kwargs = {}
    if args.task in ["coco_captioning", "nocaps"]:
        extra_kwargs["mask_prob"] = args.captioning_mask_prob

    dataset = dataset_class(
        data_path=args.data_path,
        split=split,
        transform=transform,
        tokenizer=tokenizer,
        num_max_bpe_tokens=args.num_max_bpe_tokens,
        task=args.task,
        **extra_kwargs,
    )

    if is_train:
        batch_size = args.batch_size
    elif getattr(args, "eval_batch_size", None) is not None:
        batch_size = args.eval_batch_size
    else:
        batch_size = int(args.batch_size * 1.5)

    return create_dataloader(
        dataset,
        is_train=is_train,
        batch_size=batch_size,
        num_workers=args.num_workers,
        pin_mem=args.pin_mem,
        dist_eval=args.dist_eval,
    )
def create_downstream_dataset(args, is_eval=False):
    """Return the data loader(s) for the downstream task.

    With ``is_eval`` a single loader over the "test" split is returned;
    otherwise a ``(train_loader, val_loader)`` tuple.
    """
    if is_eval:
        return create_dataset_by_split(args, split="test", is_train=False)

    train_loader = create_dataset_by_split(args, split="train", is_train=True)
    # NOTE(review): the val loader is also built with is_train=True, so it
    # gets the training transform, the training batch size and drop_last.
    # Confirm this is intended.
    val_loader = create_dataset_by_split(args, split="val", is_train=True)
    return train_loader, val_loader

model/unilm/beit3/engine_for_finetuning.py ADDED Viewed

	@@ -0,0 +1,598 @@

+# --------------------------------------------------------
+# Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks (https://arxiv.org/abs/2208.10442)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit3
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------'
+import math
+import sys
+import json
+from typing import Iterable, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.utils import ModelEma
+from timm.utils import accuracy, ModelEma
+from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
+from datasets import get_sentencepiece_model_for_beit3
+import utils
class TaskHandler(object):
    """Base class for per-task train / eval logic.

    Subclasses must implement ``train_batch``, ``eval_batch`` and
    ``after_eval``; ``before_eval`` stores the evaluation context.
    """

    def __init__(self) -> None:
        # Both are filled in by before_eval().
        self.metric_logger, self.split = None, None

    def train_batch(self, model, **kwargs):
        """Run one training step; must be overridden."""
        raise NotImplementedError()

    def eval_batch(self, model, **kwargs):
        """Run one evaluation step; must be overridden."""
        raise NotImplementedError()

    def before_eval(self, metric_logger, data_loader, **kwargs):
        """Remember the metric logger and the split of the loader's dataset."""
        self.split = data_loader.dataset.split
        self.metric_logger = metric_logger

    def after_eval(self, **kwargs):
        """Summarise the evaluation; must be overridden."""
        raise NotImplementedError()
class NLVR2Handler(TaskHandler):
    """Task handler for NLVR2 (two images + one sentence, cross-entropy loss)."""

    def __init__(self) -> None:
        super().__init__()
        self.criterion = torch.nn.CrossEntropyLoss()

    @staticmethod
    def _forward(model, image, image2, language_tokens, padding_mask):
        """Run the model on an image pair and its text; return the logits."""
        return model(
            image_a=image, image_b=image2,
            text_description=language_tokens,
            padding_mask=padding_mask)

    def train_batch(self, model, image, image2, language_tokens, padding_mask, label):
        """Return ``{"loss": ..., "acc": ...}`` for one training batch.

        ``acc`` is the fraction of samples whose highest-scoring class
        equals ``label``.
        """
        logits = self._forward(model, image, image2, language_tokens, padding_mask)
        predictions = logits.max(-1)[-1]
        accuracy_value = (predictions == label).float().mean()
        return {
            "loss": self.criterion(input=logits, target=label),
            "acc": accuracy_value,
        }

    def eval_batch(self, model, image, image2, language_tokens, padding_mask, label):
        """Update the ``acc`` meter with this batch's accuracy in percent."""
        logits = self._forward(model, image, image2, language_tokens, padding_mask)
        batch_size = language_tokens.shape[0]
        num_correct = (logits.max(-1)[-1] == label).float().sum(0)
        acc = num_correct * 100.0 / batch_size
        self.metric_logger.meters['acc'].update(acc.item(), n=batch_size)

    def after_eval(self, **kwargs):
        """Print the accuracy; return ``(global averages of all meters, "acc")``."""
        print('* Acc {acc.global_avg:.3f}'.format(acc=self.metric_logger.acc))
        averages = {name: meter.global_avg for name, meter in self.metric_logger.meters.items()}
        return averages, "acc"
class ImageNetHandler(TaskHandler):
    """Task handler for ImageNet classification."""

    def __init__(self, args) -> None:
        """Choose the training loss from the augmentation settings in ``args``.

        * mixup / cutmix active -> ``SoftTargetCrossEntropy``
        * else label smoothing > 0 -> ``LabelSmoothingCrossEntropy``
        * else plain ``CrossEntropyLoss``
        """
        super().__init__()
        if args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None:
            # smoothing is handled with mixup label transform
            criterion = SoftTargetCrossEntropy()
        elif args.label_smoothing > 0.:
            criterion = LabelSmoothingCrossEntropy(smoothing=args.label_smoothing)
        else:
            criterion = torch.nn.CrossEntropyLoss()
        self.criterion = criterion

    def train_batch(self, model, image, label):
        """Return ``{"loss": ...}`` for one training batch."""
        predictions = model(image=image)
        loss = self.criterion(predictions, label)
        return {
            "loss": loss,
        }

    def eval_batch(self, model, image, label):
        """Update the ``acc1`` and ``acc5`` meters for one evaluation batch."""
        predictions = model(image=image)
        batch_size = image.shape[0]
        top1, top5 = accuracy(predictions, label, topk=(1, 5))
        meters = self.metric_logger.meters
        meters['acc1'].update(top1.item(), n=batch_size)
        meters['acc5'].update(top5.item(), n=batch_size)

    def after_eval(self, **kwargs):
        """Print top-1/top-5; return ``(global averages of all meters, "acc1")``."""
        print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}'
            .format(top1=self.metric_logger.acc1, top5=self.metric_logger.acc5))
        averages = {name: meter.global_avg for name, meter in self.metric_logger.meters.items()}
        return averages, "acc1"
+class RetrievalHandler(TaskHandler):
+    def __init__(self) -> None:
+        super().__init__()
+        self.image_feats = []
+        self.text_feats = []
+        self.image_ids = []
+        self.metric_logger = None
+    def train_batch(self, model, image, language_tokens, padding_mask, image_id):
+        loss, vision_cls, language_cls = model(
+            image=image, text_description=language_tokens, padding_mask=padding_mask)
+        return {
+            "loss": loss,
+        }
+    def before_eval(self, metric_logger, **kwargs):
+        self.image_feats.clear()
+        self.text_feats.clear()
+        self.image_ids.clear()
+        self.metric_logger = metric_logger
+    def eval_batch(self, model, image, language_tokens, padding_mask, image_id):
+        vision_cls, _ = model(image=image, only_infer=True)
+        _, language_cls = model(
+            text_description=language_tokens, padding_mask=padding_mask, only_infer=True)
+        self.image_feats.append(vision_cls.clone())
+        self.text_feats.append(language_cls.clone())
+        self.image_ids.append(image_id.clone())
+    def after_eval(self, **kwargs):
+        image_feats = {}
+        for feats, ids in zip(self.image_feats, self.image_ids):
+            for i, _idx in enumerate(ids):
+                idx = _idx.item()
+                if idx not in image_feats:
+                    image_feats[idx] = feats[i]
+        tiids = torch.cat(self.image_ids, dim=0)
+        iids = []
+        sorted_tensors = []
+        for key in sorted(image_feats.keys()):
+            sorted_tensors.append(image_feats[key].view(1, -1))
+            iids.append(key)
+        image_cls_feats = torch.cat(sorted_tensors, dim=0)
+        text_cls_feats = torch.cat(self.text_feats, dim=0)
+        scores = image_cls_feats @ text_cls_feats.t()
+        iids = torch.LongTensor(iids).to(scores.device)
+        print("scores: {}".format(scores.size()))
+        print("iids: {}".format(iids.size()))
+        print("tiids: {}".format(tiids.size()))
+        topk10 = scores.topk(10, dim=1)
+        topk5 = scores.topk(5, dim=1)
+        topk1 = scores.topk(1, dim=1)
+        topk10_iids = tiids[topk10.indices]
+        topk5_iids = tiids[topk5.indices]
+        topk1_iids = tiids[topk1.indices]
+        tr_r10 = (iids.unsqueeze(1) == topk10_iids).float().max(dim=1)[0].mean()
+        tr_r5 = (iids.unsqueeze(1) == topk5_iids).float().max(dim=1)[0].mean()
+        tr_r1 = (iids.unsqueeze(1) == topk1_iids).float().max(dim=1)[0].mean()
+        topk10 = scores.topk(10, dim=0)
+        topk5 = scores.topk(5, dim=0)
+        topk1 = scores.topk(1, dim=0)
+        topk10_iids = iids[topk10.indices]
+        topk5_iids = iids[topk5.indices]
+        topk1_iids = iids[topk1.indices]
+        ir_r10 = (tiids.unsqueeze(0) == topk10_iids).float().max(dim=0)[0].mean()
+        ir_r5 = (tiids.unsqueeze(0) == topk5_iids).float().max(dim=0)[0].mean()
+        ir_r1 = (tiids.unsqueeze(0) == topk1_iids).float().max(dim=0)[0].mean()
+        eval_result = {
+            "tr_r10": tr_r10.item() * 100.0,
+            "tr_r5": tr_r5.item() * 100.0,
+            "tr_r1": tr_r1.item() * 100.0,
+            "ir_r10": ir_r10.item() * 100.0,
+            "ir_r5": ir_r5.item() * 100.0,
+            "ir_r1": ir_r1.item() * 100.0,
+            "average_score": 100.0 * (tr_r1 + tr_r5 + tr_r10 + ir_r1 + ir_r5 + ir_r10).item() / 6.0,
+        }
+        print('* Eval result = %s' % json.dumps(eval_result))
+        return eval_result, "average_score"
+class VQAHandler(TaskHandler):
+    def __init__(self) -> None:
+        super().__init__()
+        self.predictions = []
+        self.criterion = nn.BCEWithLogitsLoss(reduction='mean')
+        self.label2ans = None
+    def train_batch(self, model, image, language_tokens, padding_mask, labels):
+        logits = model(
+            image=image, question=language_tokens,
+            padding_mask=padding_mask)
+        return {
+            "loss": self.criterion(input=logits.float(), target=labels.float()) * labels.shape[1],
+        }
+    def before_eval(self, metric_logger, data_loader, **kwargs):
+        self.predictions.clear()
+        self.metric_logger = metric_logger
+        self.label2ans = data_loader.dataset.label2ans
+    def eval_batch(self, model, image, language_tokens, padding_mask, labels=None, qid=None):
+        logits = model(
+            image=image, question=language_tokens,
+            padding_mask=padding_mask)
+        batch_size = language_tokens.shape[0]
+        if labels is not None:
+            scores = utils.VQAScore()(logits, labels) * 100.0
+            self.metric_logger.meters['score'].update(scores.item(), n=batch_size)
+        else:
+            _, preds = logits.max(-1)
+            for image_id, pred in zip(qid, preds):
+                self.predictions.append({
+                    "question_id": image_id.item(),
+                    "answer": self.label2ans[pred.item()],
+                })
+    def after_eval(self, **kwargs):
+        if len(self.predictions) == 0:
+            print('* Score {score.global_avg:.3f}'.format(score=self.metric_logger.score))
+            return {k: meter.global_avg for k, meter in self.metric_logger.meters.items()}, "score"
+        else:
+            return self.predictions, "prediction"
+class CaptioningHandler(TaskHandler):
+    """Task handler for image captioning: masked-token training and beam-search decoding at eval time."""
+    def __init__(self, args) -> None:
+        """Build the loss and tokenizer and copy the decoding hyper-parameters from ``args``.
+        Attributes read from ``args``: ``label_smoothing``, ``drop_worst_ratio``,
+        ``drop_worst_after``, ``num_beams``, ``num_max_bpe_tokens``, ``length_penalty`` and
+        ``vocab_size``. ``args`` is also forwarded to ``get_sentencepiece_model_for_beit3``.
+        """
+        super().__init__()
+        self.predictions = []
+        self.criterion = utils.BertCaptioningLoss(args.label_smoothing, args.drop_worst_ratio, args.drop_worst_after)
+        self.tokenizer = get_sentencepiece_model_for_beit3(args)
+        self.num_beams = args.num_beams
+        self.max_len = args.num_max_bpe_tokens
+        self.length_penalty = args.length_penalty
+        self.vocab_size = args.vocab_size
+    def train_batch(self, model, image, language_tokens, masked_tokens, language_masked_pos, padding_mask, image_id, global_step):
+        """Run one training step on masked captions.
+        Returns a dict with ``"loss"`` (the criterion applied to the logits, the labels at the
+        masked positions and ``global_step``) and ``"acc"`` (share of masked positions whose
+        highest-scoring token equals the label).
+        """
+        logits, _ = model(
+            image=image, text_ids=masked_tokens, padding_mask=padding_mask, language_masked_pos=language_masked_pos, image_id=image_id)
+        # Ground-truth tokens at the masked positions only.
+        masked_labels = language_tokens[language_masked_pos.bool()]
+        score = torch.max(logits, -1)[1].data == masked_labels
+        acc = torch.sum(score.float()) / torch.sum(language_masked_pos)
+        return {
+            "loss": self.criterion(logits, masked_labels, global_step),
+            "acc": acc
+        }
+    def before_eval(self, metric_logger, data_loader, **kwargs):
+        """Reset the prediction list and store the metric logger (``data_loader`` is not used)."""
+        self.predictions.clear()
+        self.metric_logger = metric_logger
+    def eval_batch(self, model, image, image_id=None):
+        """Generate one caption per image with beam search and append it to ``self.predictions``.
+        Decoding starts from ``[CLS, MASK]`` and, at every step, feeds the model a two-token
+        input (previously chosen word, mask) and reads the logits at the mask position.
+        NOTE(review): ``image_id`` defaults to ``None`` but is iterated unconditionally at the
+        end of this method, so callers have to supply it.
+        """
+        cur_len = 2
+        num_keep_best = 1
+        TOPN_PER_BEAM = 3
+        batch_size = image.size(0)
+        mask_id = self.tokenizer.mask_token_id
+        cls_id = self.tokenizer.cls_token_id
+        pad_id = self.tokenizer.pad_token_id
+        sep_id = self.tokenizer.sep_token_id
+        eos_token_ids = [sep_id]
+        cls_ids = torch.full(
+            (batch_size, 1), cls_id, dtype=torch.long, device=image.device
+        )
+        mask_ids = torch.full(
+            (batch_size, 1), mask_id, dtype=torch.long, device=image.device
+        )
+        cur_input_ids = torch.cat([cls_ids, mask_ids], dim=1)
+        tmp_ids = torch.full(
+            (batch_size, self.max_len-1), mask_id, dtype=torch.long, device=image.device
+        )
+        # Full-length buffer of decoded tokens: CLS followed by mask placeholders.
+        decoding_results = torch.cat([cls_ids, tmp_ids], dim=1)
+        # Expand input to num beams
+        cur_input_ids = cur_input_ids.unsqueeze(1).expand(batch_size, self.num_beams, cur_len)
+        cur_input_ids = cur_input_ids.contiguous().view(batch_size * self.num_beams, cur_len)  # (batch_size * num_beams, cur_len)
+        decoding_results = decoding_results.unsqueeze(1).expand(batch_size, self.num_beams, self.max_len)
+        decoding_results = decoding_results.contiguous().view(batch_size * self.num_beams, self.max_len)  # (batch_size * num_beams, max_len)
+        image = image.unsqueeze(1).expand(batch_size, self.num_beams, image.size(-3), image.size(-2), image.size(-1))
+        image = image.contiguous().view(batch_size * self.num_beams, image.size(-3), image.size(-2), image.size(-1))
+        # One hypothesis container per input image.
+        generated_hyps = [
+            utils.BeamHypotheses(
+                num_keep_best, self.max_len, length_penalty=self.length_penalty, early_stopping=False
+            ) for _ in range(batch_size)
+        ]
+        # scores for each sentence in the beam
+        beam_scores = torch.zeros((batch_size, self.num_beams), dtype=torch.float, device=cur_input_ids.device)
+        # All beams of a sample start from the same prefix; every beam but the first gets a very
+        # low score, presumably so the first top-k does not pick duplicate continuations.
+        beam_scores[:, 1:] = -1e9
+        beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
+        # done sentences
+        done = [False for _ in range(batch_size)]
+        incremental_state = {}
+        while cur_len <= self.max_len:
+            # cur_input_ids always has two columns (previous word, mask); column 1 is the mask.
+            next_token_idx = 1
+            padding_masks = torch.full(
+                cur_input_ids.shape, 0, dtype=torch.long, device=image.device
+            )
+            # The image is only passed on the first step; later steps pass None together with
+            # the incremental_state returned by the previous call.
+            input_image = image
+            if cur_len != 2:
+                input_image = None
+            outputs, incremental_state_next = model(
+                image=input_image, text_ids=cur_input_ids, language_masked_pos=None,
+                padding_mask=padding_masks, text_len=cur_len, incremental_state=incremental_state)
+            incremental_state = incremental_state_next
+            # assert outputs.shape[1] == token_len
+            scores = outputs[:, next_token_idx, :] # (batch_size * num_beams, vocab_size)
+            scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
+            assert scores.size() == (batch_size * self.num_beams, self.vocab_size)
+            # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
+            _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
+            # re-organize to group the beam together (we are keeping top hypothesis across beams)
+            _scores = _scores.view(batch_size, self.num_beams * self.vocab_size)  # (batch_size, num_beams * vocab_size)
+            next_scores, next_words = torch.topk(_scores, TOPN_PER_BEAM * self.num_beams, dim=1, largest=True, sorted=True)
+            assert next_scores.size() == next_words.size() == (batch_size, TOPN_PER_BEAM * self.num_beams)
+            # next batch beam content
+            # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
+            next_batch_beam = []
+            # for each sentence
+            for batch_ex in range(batch_size):
+                # if we are done with this sentence
+                done[batch_ex] = done[batch_ex] or generated_hyps[batch_ex].is_done(next_scores[batch_ex].max().item())
+                if done[batch_ex]:
+                    next_batch_beam.extend([(0, pad_id, 0)] * self.num_beams)  # pad the batch
+                    continue
+                # next sentence beam content
+                next_sent_beam = []
+                for idx, score in zip(next_words[batch_ex], next_scores[batch_ex]):
+                    # get beam and word IDs
+                    beam_id = idx // self.vocab_size
+                    word_id = idx % self.vocab_size
+                    # end of sentence, or next word
+                    # if word_id.item() in eos_token_ids or cur_len + 1 == max_len:
+                    if (word_id.item() in eos_token_ids and cur_len + 1 <= self.max_len) or (cur_len + 1 == self.max_len):
+                        # Finished hypothesis: store the tokens decoded so far with the candidate score.
+                        generated_hyps[batch_ex].add(
+                            decoding_results[batch_ex * self.num_beams + beam_id, :cur_len].clone(), score.item()
+                        )
+                    else:
+                        next_sent_beam.append((score, word_id, batch_ex * self.num_beams + beam_id))
+                    # the beam for next step is full
+                    if len(next_sent_beam) == self.num_beams:
+                        break
+                # update next beam content
+                if cur_len + 1 == self.max_len:
+                    assert len(next_sent_beam) == 0
+                else:
+                    assert len(next_sent_beam) == self.num_beams
+                if len(next_sent_beam) == 0:
+                    next_sent_beam = [(0, pad_id, 0)] * self.num_beams  # pad the batch
+                next_batch_beam.extend(next_sent_beam)
+                assert len(next_batch_beam) == self.num_beams * (batch_ex + 1)
+            # sanity check / prepare next batch
+            assert len(next_batch_beam) == batch_size * self.num_beams
+            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+            beam_words = cur_input_ids.new([x[1] for x in next_batch_beam])
+            beam_idx = cur_input_ids.new([x[2] for x in next_batch_beam])
+            # re-order batch
+            cur_input_ids = cur_input_ids[beam_idx, :]
+            decoding_results = decoding_results[beam_idx, :]
+            # Reorder every cached tensor to follow the selected beams and drop the last entry
+            # along dim 2 (presumably the cache slot of the trailing mask token; TODO confirm).
+            for module in incremental_state:
+                for key in incremental_state[module]:
+                    result = incremental_state[module][key].index_select(0, beam_idx)
+                    incremental_state[module][key] = result[:,:,:-1,:]
+            next_ids = torch.full(
+                (batch_size * self.num_beams, 1), mask_id, dtype=torch.long, device=image.device
+            )
+            # Next input: the word just chosen for each beam followed by a fresh mask token.
+            cur_input_ids = torch.cat([beam_words.unsqueeze(1), next_ids], dim=1)
+            decoding_results[:, cur_len-1] = beam_words
+            # update current length
+            cur_len = cur_len + 1
+            # stop when we are done with each sentence
+            if all(done):
+                break
+        # select the best hypotheses
+        # tgt_len is created without a device argument; logprobs is moved to the input device.
+        tgt_len = torch.ones(batch_size, num_keep_best, dtype=torch.long)
+        logprobs = torch.zeros(batch_size, num_keep_best,
+                    dtype=torch.float).fill_(-1e5).to(cur_input_ids.device)
+        all_best = []
+        for i, hypotheses in enumerate(generated_hyps):
+                best = []
+                # Each entry of hypotheses.hyp is unpacked below as (score, token tensor).
+                hyp_scores = torch.tensor([x[0] for x in hypotheses.hyp])
+                _, best_indices = torch.topk(hyp_scores,
+                        min(num_keep_best, len(hyp_scores)), largest=True)
+                for best_idx, hyp_idx in enumerate(best_indices):
+                    conf, best_hyp = hypotheses.hyp[hyp_idx]
+                    best.append(best_hyp)
+                    logprobs[i, best_idx] = conf
+                    tgt_len[i, best_idx] = len(best_hyp) + 1  # +1 for the <EOS> symbol
+                all_best.append(best)
+        # generate target batch, pad to the same length
+        decoded = cur_input_ids.new(batch_size, num_keep_best, self.max_len).fill_(pad_id)
+        for batch_idx, best in enumerate(all_best):
+            for best_idx, hypo in enumerate(best):
+                decoded[batch_idx, best_idx, : tgt_len[batch_idx, best_idx] - 1] = hypo
+                decoded[batch_idx, best_idx, tgt_len[batch_idx, best_idx] - 1] = eos_token_ids[0]
+        # num_keep_best is 1, so squeezing dim 1 leaves one token row per image.
+        captions = self.tokenizer.batch_decode(decoded.squeeze(1), skip_special_tokens=True)
+        for qid, pred in zip(image_id, captions):
+            self.predictions.append({
+                "image_id": qid.item(),
+                "caption": pred,
+            })
+    def after_eval(self, **kwargs):
+        """Return ``(self.predictions, "prediction")``."""
+        return self.predictions, "prediction"
+def get_handler(args):
+    if args.task == "nlvr2":
+        return NLVR2Handler()
+    elif args.task == "vqav2":
+        return VQAHandler()
+    elif args.task in ("flickr30k", "coco_retrieval"):
+        return RetrievalHandler()
+    elif args.task in ("coco_captioning", "nocaps"):
+        return CaptioningHandler(args)
+    elif args.task in ("imagenet"):
+        return ImageNetHandler(args)
+    else:
+        raise NotImplementedError("Sorry, %s is not support." % args.task)
+def train_one_epoch(
+        model: torch.nn.Module, data_loader: Iterable,
+        optimizer: torch.optim.Optimizer, device: torch.device,
+        handler: TaskHandler, epoch: int, start_steps: int,
+        lr_schedule_values: list, loss_scaler, max_norm: float = 0,
+        update_freq: int = 1, model_ema: Optional[ModelEma] = None,
+        log_writer: Optional[utils.TensorboardLogger] = None,
+        task = None, mixup_fn=None,
+):
+    model.train(True)
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+    metric_logger.add_meter('min_lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+    header = 'Epoch: [{}]'.format(epoch)
+    print_freq = 10
+    if loss_scaler is None:
+        model.zero_grad()
+        model.micro_steps = 0
+    else:
+        optimizer.zero_grad()
+    for data_iter_step, data in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+        step = data_iter_step // update_freq
+        global_step = start_steps + step  # global training iteration
+        # Update LR & WD for the first acc
+        if lr_schedule_values is not None and data_iter_step % update_freq == 0:
+            for i, param_group in enumerate(optimizer.param_groups):
+                if lr_schedule_values is not None:
+                    param_group["lr"] = lr_schedule_values[global_step] * param_group["lr_scale"]
+        # put input data into cuda
+        for tensor_key in data.keys():
+            data[tensor_key] = data[tensor_key].to(device, non_blocking=True)
+            # print("input %s = %s" % (tensor_key, data[tensor_key]))
+            if loss_scaler is None and tensor_key.startswith("image"):
+                data[tensor_key] = data[tensor_key].half()
+        # mixup for imagenet finetuning
+        if mixup_fn is not None:
+            data["image"], data["label"] = mixup_fn(data["image"], data["label"])
+        if task in ["coco_captioning", "nocaps"]:
+            data["global_step"] = global_step
+        if loss_scaler is None:
+            results = handler.train_batch(model, **data)
+        else:
+            with torch.cuda.amp.autocast():
+                results = handler.train_batch(model, **data)
+        loss = results.pop("loss")
+        loss_value = loss.item()
+        if not math.isfinite(loss_value):
+            print("Loss is {}, stopping training".format(loss_value))
+            sys.exit(1)
+        if loss_scaler is None:
+            loss /= update_freq
+            model.backward(loss)
+            model.step()
+            if (data_iter_step + 1) % update_freq == 0:
+                # model.zero_grad()
+                # Deepspeed will call step() & model.zero_grad() automatic
+                if model_ema is not None:
+                    model_ema.update(model)
+            grad_norm = None
+            loss_scale_value = utils.get_loss_scale_for_deepspeed(model)
+        else:
+            # this attribute is added by timm on one optimizer (adahessian)
+            is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
+            loss /= update_freq
+            grad_norm = loss_scaler(loss, optimizer, clip_grad=max_norm,
+                                    parameters=model.parameters(), create_graph=is_second_order,
+                                    update_grad=(data_iter_step + 1) % update_freq == 0)
+            if (data_iter_step + 1) % update_freq == 0:
+                optimizer.zero_grad()
+                if model_ema is not None:
+                    model_ema.update(model)
+            loss_scale_value = loss_scaler.state_dict()["scale"]
+        torch.cuda.synchronize()
+        metric_logger.update(loss=loss_value)
+        metric_logger.update(loss_scale=loss_scale_value)
+        min_lr = 10.
+        max_lr = 0.
+        for group in optimizer.param_groups:
+            min_lr = min(min_lr, group["lr"])
+            max_lr = max(max_lr, group["lr"])
+        metric_logger.update(lr=max_lr)
+        metric_logger.update(min_lr=min_lr)
+        weight_decay_value = None
+        for group in optimizer.param_groups:
+            if group["weight_decay"] > 0:
+                weight_decay_value = group["weight_decay"]
+        metric_logger.update(weight_decay=weight_decay_value)
+        metric_logger.update(grad_norm=grad_norm)
+        if log_writer is not None:
+            kwargs = {
+                "loss": loss_value,
+            }
+            for key in results:
+                kwargs[key] = results[key]
+            log_writer.update(head="train", **kwargs)
+            kwargs = {
+                "loss_scale": loss_scale_value,
+                "lr": max_lr,
+                "min_lr": min_lr,
+                "weight_decay": weight_decay_value,
+                "grad_norm": grad_norm,
+            }
+            log_writer.update(head="opt", **kwargs)
+            log_writer.set_step()
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+@torch.no_grad()
+def evaluate(data_loader, model, device, handler):
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    header = 'Test:'
+    # switch to evaluation mode
+    model.eval()
+    handler.before_eval(metric_logger=metric_logger, data_loader=data_loader)
+    for data in metric_logger.log_every(data_loader, 10, header):
+        for tensor_key in data.keys():
+            data[tensor_key] = data[tensor_key].to(device, non_blocking=True)
+        with torch.cuda.amp.autocast():
+            handler.eval_batch(model=model, **data)
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    return handler.after_eval()

model/unilm/beit3/get_started/get_started_for_captioning.md ADDED Viewed

	@@ -0,0 +1,176 @@

+# Fine-tuning BEiT-3 on Image Captioning
+## COCO Captioning Setup
+1. [Setup environment](../README.md#setup).
+2. Download [2014 train images](http://images.cocodataset.org/zips/train2014.zip), [2014 val images](http://images.cocodataset.org/zips/val2014.zip) and [karpathy split](https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip), then organize the dataset in the following structure:
+```
+/path/to/your_data/
+  train2014/
+    COCO_train2014_000000000009.jpg
+    ...
+  val2014/
+    COCO_val2014_000000000042.jpg
+    ...
+  dataset_coco.json
+```
+We then generate the index json files using the following command. [beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from datasets import CaptioningDataset
+from transformers import XLMRobertaTokenizer
+tokenizer = XLMRobertaTokenizer("/your_beit3_model_path/beit3.spm")
+CaptioningDataset.make_coco_captioning_dataset_index(
+    data_path="/path/to/your_data",
+    tokenizer=tokenizer,
+)
+```
+## NoCaps Setup
+1. [Setup environment](../README.md#setup).
+2. Download [NoCaps val set](https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json), [NoCaps test set](https://s3.amazonaws.com/nocaps/nocaps_test_image_info.json) and download the images using the URLs in the val and test json files, then organize the dataset in the following structure:
+```
+/path/to/your_data/
+  val/
+    09c863d76bcf6b00.jpg
+    ...
+  test/
+    19dc6913830a0a21.jpg
+    ...
+  nocaps_val_4500_captions.json
+  nocaps_test_image_info.json
+```
+We then generate the index json files using the following command. [beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from datasets import CaptioningDataset
+from transformers import XLMRobertaTokenizer
+tokenizer = XLMRobertaTokenizer("/your_beit3_model_path/beit3.spm")
+CaptioningDataset.make_nocaps_captioning_dataset_index(
+    data_path="/path/to/your_data",
+)
+```
+We use the COCO captioning training set as the training data for NoCaps.
+## Example: Fine-tuning BEiT-3 on Captioning
+The BEiT-3 **base** model can be fine-tuned on captioning tasks using 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_base_patch16_480 \
+        --input_size 480 \
+        --task coco_captioning \
+        --batch_size 32 \
+        --layer_decay 1.0 \
+        --lr 4e-5 \
+        --randaug \
+        --epochs 10 \
+        --warmup_epochs 1 \
+        --drop_path 0.1 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.05 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --num_max_bpe_tokens 32 \
+        --captioning_mask_prob 0.7 \
+        --drop_worst_after 12000 \
+        --dist_eval \
+        --checkpoint_activations \
+        --enable_deepspeed
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*32 = 256`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models).
+- `--task`: **coco_captioning** for COCO captioning and **nocaps** for NoCaps dataset.
+- `--lr`: 4e-5 for COCO captioning and 1e-5 for NoCaps.
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+- `--checkpoint_activations`: using gradient checkpointing for saving GPU memory.
+The BEiT-3 **large** model can be fine-tuned on captioning tasks using 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_large_patch16_480 \
+        --input_size 480 \
+        --task coco_captioning \
+        --batch_size 32 \
+        --layer_decay 1.0 \
+        --lr 8e-6 \
+        --randaug \
+        --epochs 10 \
+        --warmup_epochs 1 \
+        --drop_path 0.1 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.05 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --num_max_bpe_tokens 32 \
+        --captioning_mask_prob 0.7 \
+        --drop_worst_after 12000 \
+        --dist_eval \
+        --checkpoint_activations \
+        --enable_deepspeed
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*32 = 256`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models).
+- `--task`: **coco_captioning** for COCO captioning and **nocaps** for NoCaps dataset.
+- `--lr`: 8e-6 for both COCO captioning and NoCaps.
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+- `--checkpoint_activations`: using gradient checkpointing for saving GPU memory.
+## Example: Evaluate BEiT-3 Fine-tuned model on Captioning
+- Get the prediction file of the fine-tuned BEiT3-base model on captioning with 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_base_patch16_480 \
+        --input_size 480 \
+        --task coco_captioning \
+        --batch_size 16 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_480_coco_captioning.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_prediction \
+        --eval \
+        --dist_eval
+```
+- `--task`: **coco_captioning** for COCO captioning and **nocaps** for NoCaps dataset.
+- `--finetune`: **beit3_base_patch16_480_coco_captioning.pth** for COCO captioning and **beit3_base_patch16_480_nocaps.pth** for NoCaps dataset.
+- Get the prediction file of the fine-tuned BEiT3-large model on captioning with 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_large_patch16_480 \
+        --input_size 480 \
+        --task coco_captioning \
+        --batch_size 16 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_480_coco_captioning.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_prediction \
+        --eval \
+        --dist_eval
+```
+- `--task`: **coco_captioning** for COCO captioning and **nocaps** for NoCaps dataset.
+- `--finetune`: **beit3_large_patch16_480_coco_captioning.pth** for COCO captioning and **beit3_large_patch16_480_nocaps.pth** for NoCaps dataset.
+Please then submit the prediction file in the `output_dir` to the [evaluation server](https://eval.ai/web/challenges/challenge-page/355/overview) to obtain the NoCaps val and test results.

model/unilm/beit3/get_started/get_started_for_image_classification.md ADDED Viewed

	@@ -0,0 +1,138 @@

+# Fine-tuning BEiT-3 on ImageNet-1k (Image Classification)
+## Setup
+1. [Setup environment](../README.md#setup).
+2. Download and extract ImageNet-1k from http://image-net.org/.
+The directory structure is the standard layout of torchvision's [`datasets.ImageFolder`](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder). The training and validation data are expected to be in the `train/` folder and `val/` folder, respectively:
+```
+/path/to/imagenet/
+  train/
+    class1/
+      img1.jpeg
+    class2/
+      img2.jpeg
+  val/
+    class1/
+      img3.jpeg
+    class2/
+      img4.jpeg
+```
+We then generate the index json files using the following command. [beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from datasets import ImageNetDataset
+ImageNetDataset.make_dataset_index(
+    train_data_path = "/path/to/your_data/train",
+    val_data_path = "/path/to/your_data/val",
+    index_path = "/path/to/your_data"
+)
+```
+## Example: Fine-tuning BEiT-3 on ImageNet-1k (Image Classification)
+The BEiT-3 **base** model can be finetuned on ImageNet-1k using 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_base_patch16_224 \
+        --task imagenet \
+        --batch_size 128 \
+        --layer_decay 0.65 \
+        --lr 7e-4 \
+        --update_freq 1 \
+        --epochs 50 \
+        --warmup_epochs 5 \
+        --drop_path 0.15 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.05 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --dist_eval \
+        --mixup 0.8 \
+        --cutmix 1.0 \
+        --enable_deepspeed
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*128*1 = 1024`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models)
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+The BEiT-3 **large** model can be finetuned on ImageNet-1k using a DGX box (8 V100-32GB):
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_large_patch16_224 \
+        --task imagenet \
+        --batch_size 128 \
+        --layer_decay 0.8 \
+        --lr 2e-4 \
+        --update_freq 1 \
+        --epochs 50 \
+        --warmup_epochs 5 \
+        --drop_path 0.25 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.05 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --dist_eval \
+        --mixup 0.8 \
+        --cutmix 1.0 \
+        --enable_deepspeed \
+        --checkpoint_activations
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*128*1 = 1024`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models)
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+- `--checkpoint_activations`: using gradient checkpointing for saving GPU memory
+## Example: Evaluate BEiT-3 Finetuned model on ImageNet-1k (Image Classification)
+- Evaluate our fine-tuned BEiT3-base model on ImageNet val with a single GPU:
+```bash
+python -m torch.distributed.launch --nproc_per_node=1 run_beit3_finetuning.py \
+        --model beit3_base_patch16_224 \
+        --task imagenet \
+        --batch_size 128 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_224_in1k.pth \
+        --data_path /path/to/your_data \
+        --eval \
+        --dist_eval
+```
+Expected results:
+```
+* Acc@1 85.400 Acc@5 97.630
+```
+- Evaluate our fine-tuned BEiT3-large model on ImageNet val with a single GPU:
+```bash
+python -m torch.distributed.launch --nproc_per_node=1 run_beit3_finetuning.py \
+        --model beit3_large_patch16_224 \
+        --task imagenet \
+        --batch_size 128 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_224_in1k.pth \
+        --data_path /path/to/your_data \
+        --eval \
+        --dist_eval
+```
+Expected results:
+```
+* Acc@1 87.580 Acc@5 98.326
+```

model/unilm/beit3/get_started/get_started_for_nlvr2.md ADDED Viewed

	@@ -0,0 +1,136 @@

+# Fine-tuning BEiT-3 on NLVR2 (Visual Reasoning)
+## Setup
+1. [Setup environment](../README.md#setup).
+2. Clone the [repository](https://github.com/lil-lab/nlvr) and sign the [request form](https://goo.gl/forms/yS29stWnFWzrDBFH3) to download the images, then organize the dataset in the following structure:
+```
+/path/to/your_data/
+  images/train/
+    0/train-11670-0-img0.png
+    ...
+  dev/
+    dev-269-0-img0.png
+    ...
+  test1/
+    test1-261-0-img0.png
+    ...
+  nlvr/ (nlvr repo)
+    nlvr/
+    nlvr2/
+```
+We then generate the index json files using the following command. [beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from datasets import NLVR2Dataset
+from transformers import XLMRobertaTokenizer
+tokenizer = XLMRobertaTokenizer("/your_beit3_model_path/beit3.spm")
+NLVR2Dataset.make_dataset_index(
+    data_path="/path/to/your_data",
+    tokenizer=tokenizer,
+    nlvr_repo_path="/path/to/your_data/nlvr"
+)
+```
+## Example: Fine-tuning BEiT-3 on NLVR2 (Visual Reasoning)
+The BEiT-3 **base** model can be finetuned on NLVR2 using 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_base_patch16_224 \
+        --task nlvr2 \
+        --batch_size 32 \
+        --layer_decay 0.65 \
+        --lr 7e-4 \
+        --epochs 20 \
+        --warmup_epochs 5 \
+        --drop_path 0.2 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.2 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --enable_deepspeed
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*32 = 256`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models).
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+- `--lr`: 7e-4 for `BEiT3-base`, 5e-4 for `BEiT3-base-indomain`.
+The BEiT-3 **large** model can be finetuned on NLVR2 using 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_large_patch16_224 \
+        --task nlvr2 \
+        --batch_size 32 \
+        --layer_decay 0.85 \
+        --lr 3e-4 \
+        --epochs 20 \
+        --warmup_epochs 5 \
+        --drop_path 0.2 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.2 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --enable_deepspeed \
+        --checkpoint_activations
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*32 = 256`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models).
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+- `--lr`: 3e-4 for `BEiT3-large`, 1e-4 for `BEiT3-large-indomain`.
+- `--checkpoint_activations`: using gradient checkpointing for saving GPU memory.
+## Example: Evaluate BEiT-3 Finetuned model on NLVR2 (Visual Reasoning)
+- Get the result of our fine-tuned BEiT3-base model on NLVR2 test with 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_base_patch16_224 \
+        --task nlvr2 \
+        --batch_size 32 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_224_nlvr2.pth \
+        --data_path /path/to/your_data \
+        --eval \
+        --dist_eval
+```
+Expected results:
+```
+* Acc 84.386
+```
+- Get the result of our fine-tuned BEiT3-large model on NLVR2 test with 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_large_patch16_224 \
+        --task nlvr2 \
+        --batch_size 32 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_224_nlvr2.pth \
+        --data_path /path/to/your_data \
+        --eval \
+        --dist_eval
+```
+Expected results:
+```
+* Acc 89.437
+```

model/unilm/beit3/get_started/get_started_for_retrieval.md ADDED Viewed

	@@ -0,0 +1,161 @@

+# Fine-tuning BEiT-3 on Image-text Retrieval
+## COCO Retrieval Setup
+1. [Setup environment](../README.md#setup).
+2. Download [2014 train images](http://images.cocodataset.org/zips/train2014.zip), [2014 val images](http://images.cocodataset.org/zips/val2014.zip) and [karpathy split](https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip), then organize the dataset in the following structure:
+```
+/path/to/your_data/
+  train2014/
+    COCO_train2014_000000000009.jpg
+    ...
+  val2014/
+    COCO_val2014_000000000042.jpg
+    ...
+  dataset_coco.json
+```
+We then generate the index json files using the following command. [beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from datasets import RetrievalDataset
+from transformers import XLMRobertaTokenizer
+tokenizer = XLMRobertaTokenizer("/your_beit3_model_path/beit3.spm")
+RetrievalDataset.make_coco_dataset_index(
+    data_path="/path/to/your_data",
+    tokenizer=tokenizer,
+)
+```
+## Flickr30k Retrieval Setup
+1. [Setup environment](../README.md#setup).
+2. Sign [flickr images request form](https://forms.illinois.edu/sec/229675) and download [karpathy split](https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip), then organize the dataset in the following structure:
+```
+/path/to/your_data/
+  flickr30k-images/
+    2923475135.jpg
+    ...
+  dataset_flickr30k.json
+```
+We then generate the index json files using the following command. [beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from datasets import RetrievalDataset
+from transformers import XLMRobertaTokenizer
+tokenizer = XLMRobertaTokenizer("/your_beit3_model_path/beit3.spm")
+RetrievalDataset.make_flickr30k_dataset_index(
+    data_path="/path/to/your_data",
+    tokenizer=tokenizer,
+    karpathy_path="/path/to/your_data",
+)
+```
+## Example: Fine-tuning BEiT-3 on Retrieval
+The BEiT-3 **base** model can be finetuned on retrieval tasks using 16 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=16 run_beit3_finetuning.py \
+        --model beit3_base_patch16_384 \
+        --input_size 384 \
+        --task coco_retrieval \
+        --batch_size 192 \
+        --layer_decay 0.65 \
+        --lr 2e-4 \
+        --epochs 15 \
+        --warmup_epochs 3 \
+        --drop_path 0.2 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_itc_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.05 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --enable_deepspeed \
+        --checkpoint_activations
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `192*16 = 3072`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models)
+- `--task`: **coco_retrieval** for COCO retrieval, **flickr30k** for Flickr30k retrieval
+- `--lr`: 2e-4 for COCO retrieval, 1e-4 for Flickr30k retrieval
+- `--epochs`: 15 for COCO retrieval, 20 for Flickr30k retrieval
+- `--warmup_epochs`: 3 for COCO retrieval, 5 for Flickr30k retrieval
+- `--checkpoint_activations`: using gradient checkpointing for saving GPU memory
+The BEiT-3 **large** model can be finetuned on retrieval tasks using 2x16 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=16 --nnodes=2 --node_rank=$NODE_RANK \
+       --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT run_beit3_finetuning.py \
+        --model beit3_large_patch16_384 \
+        --input_size 384 \
+        --task coco_retrieval \
+        --batch_size 96 \
+        --layer_decay 0.85 \
+        --lr 5e-5 \
+        --epochs 15 \
+        --warmup_epochs 3 \
+        --drop_path 0.2 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_itc_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.05 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --enable_deepspeed \
+        --checkpoint_activations
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `96*32 = 3072`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models)
+- `--task`: **coco_retrieval** for COCO retrieval, **flickr30k** for Flickr30k retrieval
+- `--epochs`: 15 for COCO retrieval, 20 for Flickr30k retrieval
+- `--warmup_epochs`: 3 for COCO retrieval, 5 for Flickr30k retrieval
+- `--checkpoint_activations`: using gradient checkpointing for saving GPU memory
+## Example: Evaluate BEiT-3 Fine-tuned model on COCO Retrieval and Flickr30k Retrieval
+- Get the results of our fine-tuned BEiT3-base model on retrieval tasks using a single GPU:
+```bash
+python -m torch.distributed.launch --nproc_per_node=1 run_beit3_finetuning.py \
+        --model beit3_base_patch16_384 \
+        --input_size 384 \
+        --task coco_retrieval \
+        --batch_size 16 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_384_coco_retrieval.pth \
+        --data_path /path/to/your_data \
+        --eval \
+        --dist_eval
+```
+- `--task`: **coco_retrieval** for COCO retrieval, **flickr30k** for Flickr30k retrieval
+- `--finetune`: **beit3_base_patch16_384_coco_retrieval.pth** for COCO retrieval, **beit3_base_patch16_384_f30k_retrieval.pth** for Flickr30k retrieval
+- Get the results of our fine-tuned BEiT3-large model on retrieval tasks using a single GPU:
+```bash
+python -m torch.distributed.launch --nproc_per_node=1 run_beit3_finetuning.py \
+        --model beit3_large_patch16_384 \
+        --input_size 384 \
+        --task coco_retrieval \
+        --batch_size 16 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_384_coco_retrieval.pth \
+        --data_path /path/to/your_data \
+        --eval \
+        --dist_eval
+```
+- `--task`: **coco_retrieval** for COCO retrieval, **flickr30k** for Flickr30k retrieval
+- `--finetune`: **beit3_large_patch16_384_coco_retrieval.pth** for COCO retrieval, **beit3_large_patch16_384_f30k_retrieval.pth** for Flickr30k retrieval

model/unilm/beit3/get_started/get_started_for_vqav2.md ADDED Viewed

	@@ -0,0 +1,144 @@

+# Fine-tuning BEiT-3 on VQAv2 (Visual Question Answering)
+## Setup
+1. [Setup environment](../README.md#setup).
+2. Download COCO [2014 train images](http://images.cocodataset.org/zips/train2014.zip), [2014 val images](http://images.cocodataset.org/zips/val2014.zip), [2015 test images](http://images.cocodataset.org/zips/test2015.zip), annotations ([train](https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip), [val](https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip)), and questions ([train](https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip), [val](https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip), [test](https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip)), then organize the dataset in the following structure:
+```
+/path/to/your_data/
+  train2014/
+    COCO_train2014_000000000009.jpg
+    ...
+  val2014/
+    COCO_val2014_000000000042.jpg
+    ...
+  test2015/
+    COCO_test2015_000000000001.jpg
+    ...
+  vqa/
+    v2_OpenEnded_mscoco_train2014_questions.json
+    v2_OpenEnded_mscoco_val2014_questions.json
+    v2_OpenEnded_mscoco_test2015_questions.json
+    v2_OpenEnded_mscoco_test-dev2015_questions.json
+    v2_mscoco_train2014_annotations.json
+    v2_mscoco_val2014_annotations.json
+```
+We then generate the index json files using the following command. [beit3.spm](https://github.com/addf400/files/releases/download/beit3/beit3.spm) is the sentencepiece model used for tokenizing texts.
+```
+from datasets import VQAv2Dataset
+from transformers import XLMRobertaTokenizer
+tokenizer = XLMRobertaTokenizer("/your_beit3_model_path/beit3.spm")
+VQAv2Dataset.make_dataset_index(
+    data_path="/path/to/your_data",
+    tokenizer=tokenizer,
+    annotation_data_path="/path/to/your_data/vqa",
+)
+```
+## Example: Fine-tuning BEiT-3 on VQAv2 (Visual Question Answering)
+The BEiT-3 **base** model can be finetuned on VQAv2 using 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_base_patch16_480 \
+        --input_size 480 \
+        --task vqav2 \
+        --batch_size 16 \
+        --layer_decay 1.0 \
+        --lr 3e-5 \
+        --update_freq 1 \
+        --randaug \
+        --epochs 10 \
+        --warmup_epochs 1 \
+        --drop_path 0.1 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.01 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --task_head_lr_weight 20 \
+        --opt_betas 0.9 0.98 \
+        --enable_deepspeed
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*16 = 128`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models)
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+The BEiT-3 **large** model can be finetuned on VQAv2 using 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_large_patch16_480 \
+        --input_size 480 \
+        --task vqav2 \
+        --batch_size 16 \
+        --layer_decay 1.0 \
+        --lr 2e-5 \
+        --update_freq 1 \
+        --randaug \
+        --epochs 10 \
+        --warmup_epochs 1 \
+        --drop_path 0.15 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_224.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_model \
+        --log_dir /path/to/save/your_model/log \
+        --weight_decay 0.01 \
+        --seed 42 \
+        --save_ckpt_freq 5 \
+        --task_head_lr_weight 20 \
+        --opt_betas 0.9 0.98 \
+        --enable_deepspeed \
+        --checkpoint_activations
+```
+- `--batch_size`: batch size per GPU. Effective batch size = `number of GPUs` * `--batch_size` * `--update_freq`. So in the above example, the effective batch size is `8*16 = 128`.
+- `--finetune`: weight path of your pretrained models; please download the pretrained model weights in [README.md](../README.md#pretrained-models)
+- `--enable_deepspeed`: optional. If you use apex, please enable deepspeed.
+- `--checkpoint_activations`: using gradient checkpointing for saving GPU memory
+## Example: Evaluate BEiT-3 Finetuned model on VQAv2 (Visual Question Answering)
+- Get the prediction file of the fine-tuned BEiT3-base model on VQAv2 test with 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_base_patch16_480 \
+        --input_size 480 \
+        --task vqav2 \
+        --batch_size 16 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_base_patch16_480_vqa.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_prediction \
+        --eval \
+        --dist_eval
+```
+- Get the prediction file of the fine-tuned BEiT3-large model on VQAv2 test with 8 V100-32GB:
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_beit3_finetuning.py \
+        --model beit3_large_patch16_480 \
+        --input_size 480 \
+        --task vqav2 \
+        --batch_size 16 \
+        --sentencepiece_model /your_beit3_model_path/beit3.spm \
+        --finetune /your_beit3_model_path/beit3_large_patch16_480_vqa.pth \
+        --data_path /path/to/your_data \
+        --output_dir /path/to/save/your_prediction \
+        --eval \
+        --dist_eval
+```
+Please then submit the prediction file in the `output_dir` to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/overview) to obtain the VQAv2 test-dev and test-std results.

model/unilm/beit3/glossary.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import re
+contractions = {
+    "aint": "ain't",
+    "arent": "aren't",
+    "cant": "can't",
+    "couldve": "could've",
+    "couldnt": "couldn't",
+    "couldn'tve": "couldn't've",
+    "couldnt've": "couldn't've",
+    "didnt": "didn't",
+    "doesnt": "doesn't",
+    "dont": "don't",
+    "hadnt": "hadn't",
+    "hadnt've": "hadn't've",
+    "hadn'tve": "hadn't've",
+    "hasnt": "hasn't",
+    "havent": "haven't",
+    "hed": "he'd",
+    "hed've": "he'd've",
+    "he'dve": "he'd've",
+    "hes": "he's",
+    "howd": "how'd",
+    "howll": "how'll",
+    "hows": "how's",
+    "Id've": "I'd've",
+    "I'dve": "I'd've",
+    "Im": "I'm",
+    "Ive": "I've",
+    "isnt": "isn't",
+    "itd": "it'd",
+    "itd've": "it'd've",
+    "it'dve": "it'd've",
+    "itll": "it'll",
+    "let's": "let's",
+    "maam": "ma'am",
+    "mightnt": "mightn't",
+    "mightnt've": "mightn't've",
+    "mightn'tve": "mightn't've",
+    "mightve": "might've",
+    "mustnt": "mustn't",
+    "mustve": "must've",
+    "neednt": "needn't",
+    "notve": "not've",
+    "oclock": "o'clock",
+    "oughtnt": "oughtn't",
+    "ow's'at": "'ow's'at",
+    "'ows'at": "'ow's'at",
+    "'ow'sat": "'ow's'at",
+    "shant": "shan't",
+    "shed've": "she'd've",
+    "she'dve": "she'd've",
+    "she's": "she's",
+    "shouldve": "should've",
+    "shouldnt": "shouldn't",
+    "shouldnt've": "shouldn't've",
+    "shouldn'tve": "shouldn't've",
+    "somebody'd": "somebodyd",
+    "somebodyd've": "somebody'd've",
+    "somebody'dve": "somebody'd've",
+    "somebodyll": "somebody'll",
+    "somebodys": "somebody's",
+    "someoned": "someone'd",
+    "someoned've": "someone'd've",
+    "someone'dve": "someone'd've",
+    "someonell": "someone'll",
+    "someones": "someone's",
+    "somethingd": "something'd",
+    "somethingd've": "something'd've",
+    "something'dve": "something'd've",
+    "somethingll": "something'll",
+    "thats": "that's",
+    "thered": "there'd",
+    "thered've": "there'd've",
+    "there'dve": "there'd've",
+    "therere": "there're",
+    "theres": "there's",
+    "theyd": "they'd",
+    "theyd've": "they'd've",
+    "they'dve": "they'd've",
+    "theyll": "they'll",
+    "theyre": "they're",
+    "theyve": "they've",
+    "twas": "'twas",
+    "wasnt": "wasn't",
+    "wed've": "we'd've",
+    "we'dve": "we'd've",
+    "weve": "we've",
+    "werent": "weren't",
+    "whatll": "what'll",
+    "whatre": "what're",
+    "whats": "what's",
+    "whatve": "what've",
+    "whens": "when's",
+    "whered": "where'd",
+    "wheres": "where's",
+    "whereve": "where've",
+    "whod": "who'd",
+    "whod've": "who'd've",
+    "who'dve": "who'd've",
+    "wholl": "who'll",
+    "whos": "who's",
+    "whove": "who've",
+    "whyll": "why'll",
+    "whyre": "why're",
+    "whys": "why's",
+    "wont": "won't",
+    "wouldve": "would've",
+    "wouldnt": "wouldn't",
+    "wouldnt've": "wouldn't've",
+    "wouldn'tve": "wouldn't've",
+    "yall": "y'all",
+    "yall'll": "y'all'll",
+    "y'allll": "y'all'll",
+    "yall'd've": "y'all'd've",
+    "y'alld've": "y'all'd've",
+    "y'all'dve": "y'all'd've",
+    "youd": "you'd",
+    "youd've": "you'd've",
+    "you'dve": "you'd've",
+    "youll": "you'll",
+    "youre": "you're",
+    "youve": "you've",
+}
+manual_map = {
+    "none": "0",
+    "zero": "0",
+    "one": "1",
+    "two": "2",
+    "three": "3",
+    "four": "4",
+    "five": "5",
+    "six": "6",
+    "seven": "7",
+    "eight": "8",
+    "nine": "9",
+    "ten": "10",
+}
+articles = ["a", "an", "the"]
+period_strip = re.compile("(?!<=\d)(\.)(?!\d)")
+comma_strip = re.compile("(\d)(\,)(\d)")
+punct = [
+    ";",
+    r"/",
+    "[",
+    "]",
+    '"',
+    "{",
+    "}",
+    "(",
+    ")",
+    "=",
+    "+",
+    "\\",
+    "_",
+    "-",
+    ">",
+    "<",
+    "@",
+    "`",
+    ",",
+    "?",
+    "!",
+]
+def normalize_word(token):
+    _token = token
+    for p in punct:
+        if (p + " " in token or " " + p in token) or (
+            re.search(comma_strip, token) != None
+        ):
+            _token = _token.replace(p, "")
+        else:
+            _token = _token.replace(p, " ")
+    token = period_strip.sub("", _token, re.UNICODE)
+    _token = []
+    temp = token.lower().split()
+    for word in temp:
+        word = manual_map.setdefault(word, word)
+        if word not in articles:
+            _token.append(word)
+    for i, word in enumerate(_token):
+        if word in contractions:
+            _token[i] = contractions[word]
+    token = " ".join(_token)
+    token = token.replace(",", "")
+    return token

model/unilm/beit3/modeling_finetune.py ADDED Viewed

	@@ -0,0 +1,386 @@

+# --------------------------------------------------------
+# Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks (https://arxiv.org/abs/2208.10442)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit3
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------'
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.models.registry import register_model
+import numpy as np
+import utils
+from modeling_utils import BEiT3Wrapper, _get_base_config, _get_large_config
+class TwoLayerMLP(nn.Module):
+    def __init__(
+            self,
+            in_features,
+            hidden_features,
+            out_features,
+            norm_layer,
+            norm_input=True,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(in_features) if norm_input else nn.Identity()
+        self.dense1 = nn.Linear(in_features, hidden_features)
+        self.norm2 = norm_layer(hidden_features)
+        self.act = nn.GELU()
+        self.dense2 = nn.Linear(hidden_features, out_features)
+    def forward(self, x):
+        x = self.norm1(x)
+        x = self.dense1(x)
+        x = self.norm2(x)
+        x = self.act(x)
+        return self.dense2(x)
+class Pooler(nn.Module):
+    def __init__(self, input_features, output_features, norm_layer):
+        super().__init__()
+        self.norm = norm_layer(input_features)
+        self.dense = nn.Linear(input_features, output_features)
+        self.activation = nn.Tanh()
+    def forward(self, x):
+        cls_rep = x[:, 0, :]
+        cls_rep = self.norm(cls_rep)
+        pooled_output = self.dense(cls_rep)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+class BEiT3ForVisualReasoning(BEiT3Wrapper):
+    def __init__(
+            self,
+            args,
+            num_classes,
+            norm_layer=nn.LayerNorm,
+            **kwargs
+    ):
+        super(BEiT3ForVisualReasoning, self).__init__(args=args)
+        embed_dim = args.encoder_embed_dim
+        self.head = TwoLayerMLP(
+            in_features=embed_dim * 4,
+            hidden_features=embed_dim * 2,
+            out_features=num_classes,
+            norm_layer=norm_layer,
+        )
+        init_scale = 0.001
+        self.head.apply(self._init_weights)
+        if isinstance(self.head.dense1, nn.Linear):
+            self.head.dense1.weight.data.mul_(init_scale)
+            self.head.dense1.bias.data.mul_(init_scale)
+        if isinstance(self.head.dense2, nn.Linear):
+            self.head.dense2.weight.data.mul_(init_scale)
+            self.head.dense2.bias.data.mul_(init_scale)
+    def forward(self, image_a, image_b, text_description, padding_mask, **kwargs):
+        bsz, _ = text_description.size()
+        vision_input = torch.cat((image_a, image_b), dim=0)
+        language_input = torch.cat((text_description, text_description), dim=0)
+        padding_mask = torch.cat((padding_mask, padding_mask), dim=0)
+        outputs = self.beit3(
+            textual_tokens=language_input,
+            visual_tokens=vision_input,
+            text_padding_position=padding_mask,
+        )
+        x = outputs["encoder_out"]
+        multiway_split_position = outputs["multiway_split_position"]
+        vision_cls = x[:, 0, :]
+        language_cls = x[:, multiway_split_position, :]
+        cls_rep = torch.cat((vision_cls, language_cls), dim=-1)
+        a, b = torch.split(cls_rep, split_size_or_sections=[bsz, bsz], dim=0)
+        cls_rep = torch.cat((a, b), dim=-1)
+        return self.head(cls_rep)
+class BEiT3ForImageClassification(BEiT3Wrapper):
+    def __init__(
+            self,
+            args,
+            num_classes,
+            norm_layer=nn.LayerNorm,
+            **kwargs
+    ):
+        super(BEiT3ForImageClassification, self).__init__(args=args)
+        embed_dim = args.encoder_embed_dim
+        self.fc_norm = norm_layer(embed_dim)
+        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        self.fc_norm.apply(self._init_weights)
+        self.head.apply(self._init_weights)
+        init_scale = 0.001
+        if isinstance(self.head, nn.Linear):
+            self.head.weight.data.mul_(init_scale)
+            self.head.bias.data.mul_(init_scale)
+    def forward(self, image, **kwargs):
+        x = self.beit3(textual_tokens=None, visual_tokens=image)["encoder_out"]
+        t = x[:, 1:, :]
+        cls_x = self.fc_norm(t.mean(1))
+        return self.head(cls_x)
+class BEiT3ForCaptioning(BEiT3Wrapper):
+    def __init__(
+            self,
+            args,
+            **kwargs
+    ):
+        super(BEiT3ForCaptioning, self).__init__(args=args)
+        embed_dim = args.encoder_embed_dim
+        self.mlm_head = nn.Linear(embed_dim, args.vocab_size)
+        self.mlm_head.apply(self._init_weights)
+    def forward(self, image, text_ids, padding_mask, language_masked_pos, text_len=None, incremental_state=None, **kwargs):
+        text_len = text_len if text_len is not None else text_ids.size(1)
+        image_len = self.beit3.vision_embed.num_position_embeddings()
+        max_len = text_len + image_len
+        uni_mask = torch.zeros((max_len, max_len), dtype=torch.long, device=text_ids.device)
+        i_start, i_end = 0, image_len
+        t_start, t_end = image_len, max_len
+        # triangle mask for caption to caption
+        uni_mask[t_start:t_end, t_start:t_end] = torch.tril(torch.ones(text_len, text_len, dtype=torch.long, device=text_ids.device))
+        # full attention for caption to image
+        uni_mask[t_start:t_end, i_start:i_end] = 1
+        # full attention for image to image
+        uni_mask[i_start:i_end, i_start:i_end] = 1
+        uni_mask = 1-uni_mask
+        if incremental_state is not None:
+            for idx in range(self.get_num_layers()):
+                if idx not in incremental_state:
+                    incremental_state[idx] = {}
+        # for incremental decoding
+        positions = None
+        if image is None:
+            uni_mask = uni_mask[-2:]
+            padding_mask = None
+            # start position (2 (fairseq starts at 2) + cur_position) is equal to text_len
+            positions = torch.arange(text_len, text_ids.size(1) + text_len, device=text_ids.device).long().unsqueeze(0)
+        outputs = self.beit3(
+            textual_tokens=text_ids,
+            visual_tokens=image,
+            text_padding_position=padding_mask,
+            attn_mask=uni_mask,
+            incremental_state=incremental_state,
+            positions=positions,
+        )
+        if image is not None:
+            text_feats = outputs["encoder_out"][:, image_len:]
+        else:
+            text_feats = outputs["encoder_out"]
+        if language_masked_pos is not None:
+            text_feats = text_feats[language_masked_pos.bool()]
+        return self.mlm_head(text_feats), incremental_state
+class BEiT3ForVisualQuestionAnswering(BEiT3Wrapper):
+    """BEiT-3 encoder followed by a pooler and a classification head.
+
+    The encoder output is passed through ``self.pooler`` and then through a
+    Linear -> norm -> GELU -> Linear head that emits ``num_classes`` values.
+    """
+    def __init__(
+            self,
+            args,
+            num_classes,
+            norm_layer=nn.LayerNorm,
+            **kwargs
+    ):
+        """Create the encoder (via the parent class), the pooler and the head.
+
+        Args:
+            args: encoder config; ``args.encoder_embed_dim`` sets the feature width.
+            num_classes: output size of the last linear layer of the head.
+            norm_layer: normalisation class used by the pooler and the head.
+            **kwargs: accepted but not read here.
+        """
+        super().__init__(args=args)
+        width = args.encoder_embed_dim
+        hidden = width * 2
+        # The pooler is created and initialised before the head, as originally.
+        self.pooler = Pooler(
+            input_features=width,
+            output_features=width,
+            norm_layer=norm_layer,
+        )
+        self.pooler.apply(self._init_weights)
+        head_layers = [
+            nn.Linear(width, hidden),
+            norm_layer(hidden),
+            nn.GELU(),
+            nn.Linear(hidden, num_classes),
+        ]
+        self.head = nn.Sequential(*head_layers)
+        self.head.apply(self._init_weights)
+    def forward(self, image, question, padding_mask, **kwargs):
+        """Encode ``image`` with ``question`` and return the head output."""
+        encoded = self.beit3(
+            textual_tokens=question,
+            visual_tokens=image,
+            text_padding_position=padding_mask,
+        )
+        pooled = self.pooler(encoded["encoder_out"])
+        return self.head(pooled)
+class BEiT3ForRetrieval(BEiT3Wrapper):
+    """BEiT-3 encoder with separate linear projections for image and text.
+
+    Each modality is encoded on its own; the first token of the encoder output
+    is projected and L2-normalised. Unless ``only_infer`` is set, the two
+    embeddings are also fed to ``utils.ClipLoss``.
+    """
+    def __init__(
+            self,
+            args,
+            **kwargs
+    ):
+        """Create the encoder, both projection heads, the loss and the logit scale.
+
+        Args:
+            args: encoder config; ``args.encoder_embed_dim`` sets the head width.
+            **kwargs: accepted but not read here.
+        """
+        super().__init__(args=args)
+        width = args.encoder_embed_dim
+        self.language_head = nn.Linear(width, width, bias=False)
+        self.vision_head = nn.Linear(width, width, bias=False)
+        for projection in (self.language_head, self.vision_head):
+            projection.apply(self._init_weights)
+        self.criterion = utils.ClipLoss(
+            rank=utils.get_rank(),
+            world_size=utils.get_world_size(),
+        )
+        # Scalar parameter stored in log space; forward() applies exp() to it.
+        initial_log_scale = np.log(1 / 0.07)
+        self.logit_scale = nn.Parameter(torch.ones([]) * initial_log_scale)
+    def forward(self, image=None, text_description=None, padding_mask=None, only_infer=False, **kwargs):
+        """Embed whichever modalities are given and optionally compute the loss.
+
+        Returns:
+            ``(vision_cls, language_cls)`` when ``only_infer`` is true, otherwise
+            ``(loss, vision_cls, language_cls)``. An embedding is ``None`` when
+            its input was not supplied.
+        """
+        vision_cls = None
+        if image is not None:
+            vision_out = self.beit3(
+                textual_tokens=None,
+                visual_tokens=image,
+                text_padding_position=None,
+            )
+            first_token = vision_out["encoder_out"][:, 0, :]
+            vision_cls = F.normalize(self.vision_head(first_token), dim=-1)
+        language_cls = None
+        if text_description is not None:
+            text_out = self.beit3(
+                textual_tokens=text_description,
+                visual_tokens=None,
+                text_padding_position=padding_mask,
+            )
+            first_token = text_out["encoder_out"][:, 0, :]
+            language_cls = F.normalize(self.language_head(first_token), dim=-1)
+        if only_infer:
+            return vision_cls, language_cls
+        # The criterion also returns two logit tensors, which are not used here.
+        loss, _, _ = self.criterion(
+            vision_cls, language_cls, self.logit_scale.exp())
+        return loss, vision_cls, language_cls
+@register_model
+def beit3_base_patch16_224_imageclassification(pretrained=False, **kwargs):
+    """Return a 1000-class BEiT3ForImageClassification on the base config.
+
+    ``normalize_output`` is switched off on the config. ``pretrained`` is not
+    read; ``kwargs`` goes to both the config helper and the model.
+    """
+    config = _get_base_config(**kwargs)
+    config.normalize_output = False
+    return BEiT3ForImageClassification(config, num_classes=1000, **kwargs)
+@register_model
+def beit3_large_patch16_224_imageclassification(pretrained=False, **kwargs):
+    """Return a 1000-class BEiT3ForImageClassification on the large config.
+
+    ``normalize_output`` is switched off on the config. ``pretrained`` is not
+    read; ``kwargs`` goes to both the config helper and the model.
+    """
+    config = _get_large_config(**kwargs)
+    config.normalize_output = False
+    return BEiT3ForImageClassification(config, num_classes=1000, **kwargs)
+@register_model
+def beit3_base_patch16_224_nlvr2(pretrained=False, **kwargs):
+    """Return a two-class BEiT3ForVisualReasoning on the base config.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_base_config(**kwargs)
+    return BEiT3ForVisualReasoning(config, num_classes=2, **kwargs)
+@register_model
+def beit3_large_patch16_224_nlvr2(pretrained=False, **kwargs):
+    """Return a two-class BEiT3ForVisualReasoning on the large config.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_large_config(**kwargs)
+    return BEiT3ForVisualReasoning(config, num_classes=2, **kwargs)
+@register_model
+def beit3_base_patch16_384_vqav2(pretrained=False, **kwargs):
+    """Return a 3129-class BEiT3ForVisualQuestionAnswering, base config, img_size 384.
+
+    ``normalize_output`` is switched off on the config. ``pretrained`` is not
+    read; ``kwargs`` goes to both the config helper and the model.
+    """
+    config = _get_base_config(img_size=384, **kwargs)
+    config.normalize_output = False
+    return BEiT3ForVisualQuestionAnswering(config, num_classes=3129, **kwargs)
+@register_model
+def beit3_base_patch16_480_vqav2(pretrained=False, **kwargs):
+    """Return a 3129-class BEiT3ForVisualQuestionAnswering, base config, img_size 480.
+
+    ``normalize_output`` is switched off on the config. ``pretrained`` is not
+    read; ``kwargs`` goes to both the config helper and the model.
+    """
+    config = _get_base_config(img_size=480, **kwargs)
+    config.normalize_output = False
+    return BEiT3ForVisualQuestionAnswering(config, num_classes=3129, **kwargs)
+@register_model
+def beit3_large_patch16_384_vqav2(pretrained=False, **kwargs):
+    """Return a 3129-class BEiT3ForVisualQuestionAnswering, large config, img_size 384.
+
+    ``normalize_output`` is switched off on the config. ``pretrained`` is not
+    read; ``kwargs`` goes to both the config helper and the model.
+    """
+    config = _get_large_config(img_size=384, **kwargs)
+    config.normalize_output = False
+    return BEiT3ForVisualQuestionAnswering(config, num_classes=3129, **kwargs)
+@register_model
+def beit3_large_patch16_480_vqav2(pretrained=False, **kwargs):
+    """Return a 3129-class BEiT3ForVisualQuestionAnswering, large config, img_size 480.
+
+    ``normalize_output`` is switched off on the config. ``pretrained`` is not
+    read; ``kwargs`` goes to both the config helper and the model.
+    """
+    config = _get_large_config(img_size=480, **kwargs)
+    config.normalize_output = False
+    return BEiT3ForVisualQuestionAnswering(config, num_classes=3129, **kwargs)
+@register_model
+def beit3_large_patch16_768_vqav2(pretrained=False, **kwargs):
+    """Return a 3129-class BEiT3ForVisualQuestionAnswering, large config, img_size 768.
+
+    ``normalize_output`` is switched off on the config. ``pretrained`` is not
+    read; ``kwargs`` goes to both the config helper and the model.
+    """
+    config = _get_large_config(img_size=768, **kwargs)
+    config.normalize_output = False
+    return BEiT3ForVisualQuestionAnswering(config, num_classes=3129, **kwargs)
+@register_model
+def beit3_base_patch16_224_captioning(pretrained=False, **kwargs):
+    """Return a BEiT3ForCaptioning built on the base config.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_base_config(**kwargs)
+    return BEiT3ForCaptioning(config, **kwargs)
+@register_model
+def beit3_base_patch16_480_captioning(pretrained=False, **kwargs):
+    """Return a BEiT3ForCaptioning built on the base config with img_size 480.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_base_config(img_size=480, **kwargs)
+    return BEiT3ForCaptioning(config, **kwargs)
+@register_model
+def beit3_large_patch16_480_captioning(pretrained=False, **kwargs):
+    """Return a BEiT3ForCaptioning built on the large config with img_size 480.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_large_config(img_size=480, **kwargs)
+    return BEiT3ForCaptioning(config, **kwargs)
+@register_model
+def beit3_base_patch16_224_retrieval(pretrained=False, **kwargs):
+    """Return a BEiT3ForRetrieval built on the base config.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_base_config(**kwargs)
+    return BEiT3ForRetrieval(config, **kwargs)
+@register_model
+def beit3_base_patch16_384_retrieval(pretrained=False, **kwargs):
+    """Return a BEiT3ForRetrieval built on the base config with img_size 384.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_base_config(img_size=384, **kwargs)
+    return BEiT3ForRetrieval(config, **kwargs)
+@register_model
+def beit3_large_patch16_384_retrieval(pretrained=False, **kwargs):
+    """Return a BEiT3ForRetrieval built on the large config with img_size 384.
+
+    ``pretrained`` is not read; ``kwargs`` goes to both the config helper and
+    the model.
+    """
+    config = _get_large_config(img_size=384, **kwargs)
+    return BEiT3ForRetrieval(config, **kwargs)

model/unilm/beit3/modeling_utils.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# --------------------------------------------------------
+# Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks (https://arxiv.org/abs/2208.10442)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit3
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import math
+import torch
+import torch.nn as nn
+from timm.models.layers import trunc_normal_ as __call_trunc_normal_
+from torchscale.model.BEiT3 import BEiT3
+from torchscale.architecture.config import EncoderConfig
+def trunc_normal_(tensor, mean=0., std=1.):
+    """Fill ``tensor`` in place through timm's ``trunc_normal_``.
+
+    The truncation bounds handed to timm are ``a=-std`` and ``b=std``.
+    Nothing is returned.
+    """
+    lower, upper = -std, std
+    __call_trunc_normal_(tensor, mean=mean, std=std, a=lower, b=upper)
+def _get_base_config(
+        img_size=224, patch_size=16, drop_path_rate=0,
+        checkpoint_activations=None, mlp_ratio=4, vocab_size=64010, **kwargs
+):
+    """Build the ``EncoderConfig`` for the base model size.
+
+    Fixed here: embed dim 768, 12 attention heads, 12 layers, and an FFN width
+    of ``int(768 * mlp_ratio)``. Extra ``kwargs`` are accepted and ignored.
+    """
+    embed_dim = 768
+    ffn_dim = int(embed_dim * mlp_ratio)
+    return EncoderConfig(
+        img_size=img_size,
+        patch_size=patch_size,
+        vocab_size=vocab_size,
+        multiway=True,
+        layernorm_embedding=False,
+        normalize_output=True,
+        no_output_layer=True,
+        drop_path_rate=drop_path_rate,
+        encoder_embed_dim=embed_dim,
+        encoder_attention_heads=12,
+        encoder_ffn_embed_dim=ffn_dim,
+        encoder_layers=12,
+        checkpoint_activations=checkpoint_activations,
+    )
+def _get_large_config(
+        img_size=224, patch_size=16, drop_path_rate=0,
+        checkpoint_activations=None, mlp_ratio=4, vocab_size=64010, **kwargs
+):
+    """Build the ``EncoderConfig`` for the large model size.
+
+    Fixed here: embed dim 1024, 16 attention heads, 24 layers, and an FFN width
+    of ``int(1024 * mlp_ratio)``. Extra ``kwargs`` are accepted and ignored.
+    """
+    embed_dim = 1024
+    ffn_dim = int(embed_dim * mlp_ratio)
+    return EncoderConfig(
+        img_size=img_size,
+        patch_size=patch_size,
+        vocab_size=vocab_size,
+        multiway=True,
+        layernorm_embedding=False,
+        normalize_output=True,
+        no_output_layer=True,
+        drop_path_rate=drop_path_rate,
+        encoder_embed_dim=embed_dim,
+        encoder_attention_heads=16,
+        encoder_ffn_embed_dim=ffn_dim,
+        encoder_layers=24,
+        checkpoint_activations=checkpoint_activations,
+    )
+class BEiT3Wrapper(nn.Module):
+    """Base module that owns a ``BEiT3`` encoder as ``self.beit3``.
+
+    It also provides the weight-initialisation hook (``_init_weights``) that
+    is applied to every submodule at construction time.
+    """
+    def __init__(self, args, **kwargs):
+        """Build the encoder from ``args`` and initialise all submodules.
+
+        ``kwargs`` is accepted but not read.
+        """
+        super().__init__()
+        self.args = args
+        self.beit3 = BEiT3(args)
+        self.apply(self._init_weights)
+    def fix_init_weight(self):
+        """Divide per-layer projection weights by ``sqrt(2 * layer_index)`` (1-based).
+
+        NOTE(review): this iterates ``self.blocks`` and touches
+        ``layer.attn.proj`` / ``layer.mlp.fc2``, but no ``blocks`` attribute is
+        assigned anywhere in this class. Calling it as-is looks like it would
+        raise AttributeError unless a subclass provides one -- confirm before use.
+        """
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+    def get_num_layers(self):
+        """Return ``self.beit3.encoder.num_layers``."""
+        return self.beit3.encoder.num_layers
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        """Return the set of parameter names to exclude from weight decay."""
+        return {'pos_embed', 'cls_token', 'beit3.encoder.embed_positions.A.weight', 'beit3.vision_embed.cls_token', 'logit_scale'}
+    def _init_weights(self, m):
+        """Initialise one module; meant to be passed to ``nn.Module.apply``.
+
+        Linear weights get a truncated normal with std 0.02 and zero bias.
+        LayerNorm gets weight 1 and bias 0. Parameters that are ``None``
+        (a bias-free Linear, a LayerNorm without affine parameters) are skipped.
+        """
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            # A LayerNorm without affine parameters has weight/bias set to None.
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+            if m.weight is not None:
+                nn.init.constant_(m.weight, 1.0)

model/unilm/beit3/optim_factory.py ADDED Viewed

	@@ -0,0 +1,128 @@

+# --------------------------------------------------------
+# Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks (https://arxiv.org/abs/2208.10442)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit3
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from torch import optim as optim
+from timm.optim.lookahead import Lookahead
+import json
+def get_num_layer_for_vit(var_name, num_max_layer):
+    """Map a parameter name to a layer index.
+
+    Returns 0 for names containing ``"embed"`` and for the listed token and
+    position-embedding names, ``layer_id + 1`` for names containing
+    ``"layers.<layer_id>"``, and ``num_max_layer - 1`` for everything else
+    (including any name starting with ``"rel_pos_bias"``).
+    """
+    stem_names = (
+        "cls_token", "mask_token", "pos_embed", "language_pos_embed",
+        "word_embeddings.weight", "vision_cls_token", "vision_pos_embed"
+    )
+    is_stem = (
+        "embed" in var_name
+        or var_name in stem_names
+        or var_name.startswith("patch_embed")
+    )
+    if is_stem:
+        return 0
+    # The rel_pos_bias prefix wins over an embedded "layers." segment.
+    if not var_name.startswith("rel_pos_bias") and "layers." in var_name:
+        layer_id = int(var_name.split('layers.')[1].split('.')[0])
+        return layer_id + 1
+    return num_max_layer - 1
+def get_is_head_flag_for_vit(var_name, num_max_layer):
+    """Return 1 when ``var_name`` starts with ``"head"``, otherwise 0.
+
+    ``num_max_layer`` is unused. A matching check for a ``"pooler"`` prefix
+    existed only as commented-out code and is not applied.
+    """
+    return 1 if var_name.startswith("head") else 0
+class LayerDecayValueAssigner(object):
+    """Look up a per-layer scale value for a parameter name.
+
+    ``values`` is indexed by layer id. ``scale_handler(var_name, len(values))``
+    turns a parameter name into that id and defaults to
+    ``get_num_layer_for_vit``.
+    """
+    def __init__(self, values, scale_handler=None):
+        self.values = values
+        self.scale_handler = scale_handler if scale_handler else get_num_layer_for_vit
+    def get_scale(self, layer_id):
+        """Return ``values[layer_id]``."""
+        return self.values[layer_id]
+    def get_layer_id(self, var_name):
+        """Return the layer id the handler assigns to ``var_name``."""
+        num_values = len(self.values)
+        return self.scale_handler(var_name, num_values)
+# The implementation code is modified from Timm (https://github.com/huggingface/pytorch-image-models/tree/main/timm)
+def get_parameter_groups(model, weight_decay=1e-5, skip_list=(), get_num_layer=None, get_layer_scale=None):
+    """Split trainable parameters into optimizer groups.
+
+    A parameter goes to a ``no_decay`` group (weight decay 0) when it is 1-D,
+    its name ends with ``.bias``, or its name is in ``skip_list``; otherwise it
+    goes to a ``decay`` group. When ``get_num_layer`` is given, groups are
+    further split per layer id, and ``get_layer_scale(layer_id)`` supplies each
+    group's ``lr_scale`` (1.0 when absent). Frozen parameters are skipped.
+
+    The group-to-parameter-name mapping is printed as JSON.
+
+    Returns:
+        A list of dicts with keys ``weight_decay``, ``params`` and ``lr_scale``.
+    """
+    names_by_group = {}
+    params_by_group = {}
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen weights
+        exempt = len(param.shape) == 1 or name.endswith(".bias") or name in skip_list
+        if exempt:
+            group_name, this_weight_decay = "no_decay", 0.
+        else:
+            group_name, this_weight_decay = "decay", weight_decay
+        layer_id = None
+        if get_num_layer is not None:
+            layer_id = get_num_layer(name)
+            group_name = "layer_%d_%s" % (layer_id, group_name)
+        if group_name not in names_by_group:
+            scale = 1. if get_layer_scale is None else get_layer_scale(layer_id)
+            # Two independent records: one holds names (for logging), one tensors.
+            for registry in (names_by_group, params_by_group):
+                registry[group_name] = {
+                    "weight_decay": this_weight_decay,
+                    "params": [],
+                    "lr_scale": scale
+                }
+        params_by_group[group_name]["params"].append(param)
+        names_by_group[group_name]["params"].append(name)
+    print("Param groups = %s" % json.dumps(names_by_group, indent=2))
+    return list(params_by_group.values())
+def create_optimizer(args, model, get_num_layer=None, get_layer_scale=None, filter_bias_and_bn=True, skip_list=None):
+    """Build an AdamW optimizer (optionally wrapped in Lookahead) from ``args``.
+
+    Reads ``args.opt``, ``args.lr``, ``args.weight_decay`` and, when present
+    and not None, ``args.opt_eps`` / ``args.opt_betas``. With a non-zero weight
+    decay and ``filter_bias_and_bn`` set, parameters are grouped through
+    ``get_parameter_groups`` and the optimizer-level weight decay becomes 0.
+
+    Raises:
+        ValueError: if the last ``_``-separated part of ``args.opt`` is not
+            ``adamw``.
+    """
+    opt_name = args.opt.lower()
+    weight_decay = args.weight_decay
+    if weight_decay and filter_bias_and_bn:
+        if skip_list is not None:
+            skip = skip_list
+        elif hasattr(model, 'no_weight_decay'):
+            skip = model.no_weight_decay()
+        else:
+            skip = {}
+        parameters = get_parameter_groups(model, weight_decay, skip, get_num_layer, get_layer_scale)
+        # Decay now lives in the individual groups.
+        weight_decay = 0.
+    else:
+        parameters = model.parameters()
+    opt_args = {'lr': args.lr, 'weight_decay': weight_decay}
+    for option, attr in (('eps', 'opt_eps'), ('betas', 'opt_betas')):
+        value = getattr(args, attr, None)
+        if value is not None:
+            opt_args[option] = value
+    *prefixes, base_name = opt_name.split('_')
+    if base_name != 'adamw':
+        raise ValueError("Invalid optimizer")
+    optimizer = optim.AdamW(parameters, **opt_args)
+    if prefixes and prefixes[0] == 'lookahead':
+        optimizer = Lookahead(optimizer)
+    return optimizer

model/unilm/beit3/randaug.py ADDED Viewed

	@@ -0,0 +1,340 @@

+import cv2
+import numpy as np
+## aug functions
+def identity_func(img):
+    '''
+        no-op augmentation: hand back the very same image object
+    '''
+    return img
+def autocontrast_func(img, cutoff=0):
+    '''
+        Stretch every channel so that its darkest kept value maps to 0 and its
+        brightest kept value to 255 (intended to mirror PIL.ImageOps.autocontrast).
+
+        img: image array; channels are separated with cv2.split
+        cutoff: percentage of pixels ignored at each end of the histogram;
+            with 0 the plain channel min and max are used
+        returns: image rebuilt from the remapped channels with cv2.merge
+    '''
+    n_bins = 256
+    def tune_channel(ch):
+        n = ch.size
+        cut = cutoff * n // 100
+        if cut == 0:
+            # Convert to Python ints: unary minus on an np.uint8 scalar wraps
+            # around (-10 -> 246), which would corrupt the offset below.
+            high, low = int(ch.max()), int(ch.min())
+        else:
+            hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+            low = np.argwhere(np.cumsum(hist) > cut)
+            low = 0 if low.shape[0] == 0 else int(low[0, 0])
+            high = np.argwhere(np.cumsum(hist[::-1]) > cut)
+            high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - int(high[0, 0])
+        if high <= low:
+            # Flat channel: keep it unchanged via an identity table.
+            table = np.arange(n_bins)
+        else:
+            scale = (n_bins - 1) / (high - low)
+            offset = -low * scale
+            table = np.arange(n_bins) * scale + offset
+            table[table < 0] = 0
+            table[table > n_bins - 1] = n_bins - 1
+        table = table.clip(0, 255).astype(np.uint8)
+        return table[ch]
+    channels = [tune_channel(ch) for ch in cv2.split(img)]
+    out = cv2.merge(channels)
+    return out
+def equalize_func(img):
+    '''
+        Histogram-equalise every channel through a 256-entry lookup table.
+        Written to follow PIL.ImageOps.equalize, whose result differs from
+        cv2's own equalisation. A channel whose step comes out as 0 is
+        returned unchanged.
+    '''
+    n_bins = 256
+    def tune_channel(ch):
+        counts = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+        occupied = counts[counts != 0].reshape(-1)
+        # The last occupied bin is left out of the step computation.
+        step = np.sum(occupied[:-1]) // (n_bins - 1)
+        if step == 0:
+            return ch
+        shifted = np.empty_like(counts)
+        shifted[0] = step // 2
+        shifted[1:] = counts[:-1]
+        lut = (np.cumsum(shifted) // step).clip(0, 255).astype(np.uint8)
+        return lut[ch]
+    return cv2.merge([tune_channel(ch) for ch in cv2.split(img)])
+def rotate_func(img, degree, fill=(0, 0, 0)):
+    '''
+    Rotate about the image centre by ``degree`` (degrees, not radians) at
+    scale 1, keeping the original width and height; ``fill`` is passed to
+    cv2.warpAffine as the border value.
+    '''
+    height, width = img.shape[:2]
+    centre = width / 2, height / 2
+    matrix = cv2.getRotationMatrix2D(centre, degree, 1)
+    return cv2.warpAffine(img, matrix, (width, height), borderValue=fill)
+def solarize_func(img, thresh=128):
+    '''
+        Invert every pixel value that is >= thresh (v -> 255 - v) and leave
+        smaller values untouched, i.e. a solarize operation.
+        img is used as an index into a 256-entry table.
+    '''
+    levels = np.arange(256)
+    table = np.where(levels < thresh, levels, 255 - levels)
+    table = table.clip(0, 255).astype(np.uint8)
+    return table[img]
+def color_func(img, factor):
+    '''
+        Colour (saturation) adjustment done as a single 3x3 matrix product,
+        written as a faster stand-in for PIL.ImageEnhance.Color.
+        The matrix is a fixed 3x3 float32 term scaled by ``factor`` plus a
+        column of the grey weights 0.114 / 0.587 / 0.299; the result is
+        clipped to 0..255 and cast to uint8.
+    '''
+    grey_weights = np.float32([[0.114], [0.587], [0.299]])
+    chroma = np.float32([
+        [0.886, -0.114, -0.114],
+        [-0.587, 0.413, -0.587],
+        [-0.299, -0.299, 0.701]])
+    mix = chroma * factor + grey_weights
+    return np.matmul(img, mix).clip(0, 255).astype(np.uint8)
+def contrast_func(img, factor):
+    """
+        Scale every pixel's distance from the image's weighted mean by
+        ``factor`` (written to follow PIL.ImageEnhance.Contrast).
+        The mean is the per-channel mean weighted by 0.114 / 0.587 / 0.299;
+        results are clipped to 0..255 and cast to uint8.
+    """
+    channel_means = np.mean(img, axis=(0, 1))
+    mean = np.sum(channel_means * np.array([0.114, 0.587, 0.299]))
+    levels = np.arange(256)
+    table = ((levels - mean) * factor + mean).clip(0, 255).astype(np.uint8)
+    return table[img]
+def brightness_func(img, factor):
+    '''
+        Multiply every pixel value by ``factor`` through a lookup table,
+        i.e. a brightness adjustment; results are clipped to 0..255 and
+        cast to uint8.
+    '''
+    levels = np.arange(256, dtype=np.float32)
+    table = (levels * factor).clip(0, 255).astype(np.uint8)
+    return table[img]
+def sharpness_func(img, factor):
+    '''
+    Blend the image with a smoothed copy of itself: factor 0 returns the
+    smoothed copy, factor 1 returns ``img`` itself, other factors
+    interpolate or extrapolate between the two on the interior pixels only.
+    The original note states that the result differs from PIL only on the
+    four image borders.
+    '''
+    kernel = np.ones((3, 3), dtype=np.float32)
+    kernel[1][1] = 5
+    kernel /= 13
+    smoothed = cv2.filter2D(img, -1, kernel)
+    if factor == 0.0:
+        return smoothed
+    if factor == 1.0:
+        return img
+    blended = img.astype(np.float32)
+    inner_smooth = smoothed.astype(np.float32)[1:-1, 1:-1, :]
+    inner = blended[1:-1, 1:-1, :]
+    # The one-pixel border keeps the original values.
+    blended[1:-1, 1:-1, :] = inner_smooth + factor * (inner - inner_smooth)
+    return blended.astype(np.uint8)
+def shear_x_func(img, factor, fill=(0, 0, 0)):
+    '''
+        Warp with the affine matrix [[1, factor, 0], [0, 1, 0]] at the
+        original size; ``fill`` is the border value.
+    '''
+    height, width = img.shape[:2]
+    matrix = np.float32([[1, factor, 0], [0, 1, 0]])
+    warped = cv2.warpAffine(img, matrix, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR)
+    return warped.astype(np.uint8)
+def translate_x_func(img, offset, fill=(0, 0, 0)):
+    '''
+        Warp with the affine matrix [[1, 0, -offset], [0, 1, 0]] at the
+        original size; ``fill`` is the border value.
+        Written to follow PIL.Image.transform.
+    '''
+    height, width = img.shape[:2]
+    matrix = np.float32([[1, 0, -offset], [0, 1, 0]])
+    warped = cv2.warpAffine(img, matrix, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR)
+    return warped.astype(np.uint8)
+def translate_y_func(img, offset, fill=(0, 0, 0)):
+    '''
+        Warp with the affine matrix [[1, 0, 0], [0, 1, -offset]] at the
+        original size; ``fill`` is the border value.
+        Written to follow PIL.Image.transform.
+    '''
+    height, width = img.shape[:2]
+    matrix = np.float32([[1, 0, 0], [0, 1, -offset]])
+    warped = cv2.warpAffine(img, matrix, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR)
+    return warped.astype(np.uint8)
+def posterize_func(img, bits):
+    '''
+        Keep only the ``bits`` most significant bits of every pixel value
+        (written to follow PIL.ImageOps.posterize).
+
+        img: uint8 image array
+        bits: number of high bits to keep, 0..8 (0 clears the image)
+        returns: array of the same shape with the low bits zeroed
+    '''
+    # Mask to 8 bits before building the uint8: 255 << (8 - bits) exceeds 255
+    # for bits < 8, and NumPy 2 refuses to convert out-of-range Python ints.
+    mask = (0xFF << (8 - bits)) & 0xFF
+    out = np.bitwise_and(img, np.uint8(mask))
+    return out
+def shear_y_func(img, factor, fill=(0, 0, 0)):
+    '''
+        Warp with the affine matrix [[1, 0, 0], [factor, 1, 0]] at the
+        original size; ``fill`` is the border value.
+    '''
+    height, width = img.shape[:2]
+    matrix = np.float32([[1, 0, 0], [factor, 1, 0]])
+    warped = cv2.warpAffine(img, matrix, (width, height), borderValue=fill, flags=cv2.INTER_LINEAR)
+    return warped.astype(np.uint8)
+def cutout_func(img, pad_size, replace=(0, 0, 0)):
+    '''
+        Paint a rectangle centred on a random pixel with ``replace``.
+        The rectangle reaches ``pad_size // 2`` pixels to each side of the
+        centre and is clipped to the image; the input is not modified.
+    '''
+    fill_value = np.array(replace, dtype=np.uint8)
+    height, width = img.shape[0], img.shape[1]
+    rand_row, rand_col = np.random.random(2)
+    half = pad_size // 2
+    centre_row, centre_col = int(rand_row * height), int(rand_col * width)
+    top = max(centre_row - half, 0)
+    bottom = min(centre_row + half, height)
+    left = max(centre_col - half, 0)
+    right = min(centre_col + half, width)
+    out = img.copy()
+    out[top:bottom, left:right, :] = fill_value
+    return out
+### level to args
+def enhance_level_to_args(MAX_LEVEL):
+    '''
+        Build a converter mapping level 0..MAX_LEVEL linearly onto a factor
+        from 0.1 to 1.9, returned as a 1-tuple.
+    '''
+    def level_to_args(level):
+        fraction = level / MAX_LEVEL
+        return (fraction * 1.8 + 0.1,)
+    return level_to_args
+def shear_level_to_args(MAX_LEVEL, replace_value):
+    '''
+        Build a converter mapping level to a shear factor of magnitude
+        (level / MAX_LEVEL) * 0.3 with a random sign, paired with
+        ``replace_value``.
+    '''
+    def level_to_args(level):
+        magnitude = (level / MAX_LEVEL) * 0.3
+        negate = np.random.random() > 0.5
+        return (-magnitude if negate else magnitude, replace_value)
+    return level_to_args
+def translate_level_to_args(translate_const, MAX_LEVEL, replace_value):
+    '''
+        Build a converter mapping level to an offset of magnitude
+        (level / MAX_LEVEL) * translate_const with a random sign, paired
+        with ``replace_value``.
+    '''
+    def level_to_args(level):
+        magnitude = (level / MAX_LEVEL) * float(translate_const)
+        negate = np.random.random() > 0.5
+        return (-magnitude if negate else magnitude, replace_value)
+    return level_to_args
+def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value):
+    '''
+        Build a converter mapping level to an integer size
+        int((level / MAX_LEVEL) * cutout_const), paired with
+        ``replace_value``.
+    '''
+    def level_to_args(level):
+        size = int((level / MAX_LEVEL) * cutout_const)
+        return (size, replace_value)
+    return level_to_args
+def solarize_level_to_args(MAX_LEVEL):
+    '''
+        Build a converter mapping level to an integer threshold
+        int((level / MAX_LEVEL) * 256), returned as a 1-tuple.
+    '''
+    def level_to_args(level):
+        threshold = int((level / MAX_LEVEL) * 256)
+        return (threshold, )
+    return level_to_args
+def none_level_to_args(level):
+    '''
+        converter for ops that take no extra arguments: always an empty tuple
+    '''
+    return tuple()
+def posterize_level_to_args(MAX_LEVEL):
+    '''
+        Build a converter mapping level to an integer bit count
+        int((level / MAX_LEVEL) * 4), returned as a 1-tuple.
+    '''
+    def level_to_args(level):
+        bits = int((level / MAX_LEVEL) * 4)
+        return (bits, )
+    return level_to_args
+def rotate_level_to_args(MAX_LEVEL, replace_value):
+    '''
+        Build a converter mapping level to an angle of magnitude
+        (level / MAX_LEVEL) * 30 with a random sign, paired with
+        ``replace_value``.
+    '''
+    def level_to_args(level):
+        angle = (level / MAX_LEVEL) * 30
+        negate = np.random.random() < 0.5
+        return (-angle if negate else angle, replace_value)
+    return level_to_args
+# Augmentation name -> function applying it; called as func(img, *args).
+func_dict = {
+    'Identity': identity_func,
+    'AutoContrast': autocontrast_func,
+    'Equalize': equalize_func,
+    'Rotate': rotate_func,
+    'Solarize': solarize_func,
+    'Color': color_func,
+    'Contrast': contrast_func,
+    'Brightness': brightness_func,
+    'Sharpness': sharpness_func,
+    'ShearX': shear_x_func,
+    'TranslateX': translate_x_func,
+    'TranslateY': translate_y_func,
+    'Posterize': posterize_func,
+    'ShearY': shear_y_func,
+}
+# Shared settings for the level converters below.
+translate_const = 10
+MAX_LEVEL = 10
+replace_value = (128, 128, 128)
+# Augmentation name -> converter turning a level into the extra positional
+# arguments of the matching func_dict entry. The key order of this dict is
+# also the default op list of RandomAugment.
+arg_dict = {
+    'Identity': none_level_to_args,
+    'AutoContrast': none_level_to_args,
+    'Equalize': none_level_to_args,
+    'Rotate': rotate_level_to_args(MAX_LEVEL, replace_value),
+    'Solarize': solarize_level_to_args(MAX_LEVEL),
+    'Color': enhance_level_to_args(MAX_LEVEL),
+    'Contrast': enhance_level_to_args(MAX_LEVEL),
+    'Brightness': enhance_level_to_args(MAX_LEVEL),
+    'Sharpness': enhance_level_to_args(MAX_LEVEL),
+    'ShearX': shear_level_to_args(MAX_LEVEL, replace_value),
+    'TranslateX': translate_level_to_args(
+        translate_const, MAX_LEVEL, replace_value
+    ),
+    'TranslateY': translate_level_to_args(
+        translate_const, MAX_LEVEL, replace_value
+    ),
+    'Posterize': posterize_level_to_args(MAX_LEVEL),
+    'ShearY': shear_level_to_args(MAX_LEVEL, replace_value),
+}
+class RandomAugment(object):
+    '''
+        Apply N randomly chosen augmentations to an image.
+
+        N: number of ops sampled per call (with replacement)
+        M: level handed to each op's level converter
+        isPIL: when true, the input is first converted with np.array
+        augs: op names to sample from; every key of arg_dict when empty or None
+    '''
+    def __init__(self, N=2, M=10, isPIL=False, augs=None):
+        self.N = N
+        self.M = M
+        self.isPIL = isPIL
+        # None replaces the former mutable [] default; behaviour is the same.
+        self.augs = augs if augs else list(arg_dict.keys())
+    def get_random_ops(self):
+        '''
+            Sample N op names and return (name, probability, level) triples;
+            the probability is always 0.5.
+        '''
+        sampled_ops = np.random.choice(self.augs, self.N)
+        return [(op, 0.5, self.M) for op in sampled_ops]
+    def __call__(self, img):
+        '''
+            Run the sampled ops in order, skipping each with probability 0.5,
+            and return the resulting array.
+        '''
+        if self.isPIL:
+            img = np.array(img)
+        ops = self.get_random_ops()
+        for name, prob, level in ops:
+            if np.random.random() > prob:
+                continue
+            args = arg_dict[name](level)
+            img = func_dict[name](img, *args)
+        return img
+if __name__ == '__main__':
+    # Smoke test. The image must be uint8: several ops index a 256-entry
+    # lookup table with the pixel values, which fails for float arrays.
+    a = RandomAugment()
+    img = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
+    a(img)

model/unilm/beit3/requirements.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+torch
+torchvision
+timm==0.4.12
+Pillow
+blobfile
+mypy
+numpy
+pytest
+requests
+einops
+tensorboardX
+scipy
+ftfy
+opencv-python
+sentencepiece
+pyarrow
+torchmetrics==0.7.3
+transformers
+deepspeed==0.4.0
+pycocotools
+pycocoevalcap
+torchscale==0.2.0

model/unilm/beit3/run_beit3_finetuning.py ADDED Viewed

	@@ -0,0 +1,448 @@

+# --------------------------------------------------------
+# Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks (https://arxiv.org/abs/2208.10442)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit3
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import argparse
+import datetime
+import numpy as np
+import time
+import torch
+import torch.backends.cudnn as cudnn
+import json
+import os
+from pathlib import Path
+from timm.data.mixup import Mixup
+from timm.models import create_model
+from timm.utils import ModelEma
+from optim_factory import create_optimizer, get_parameter_groups, \
+    LayerDecayValueAssigner, get_is_head_flag_for_vit
+from engine_for_finetuning import train_one_epoch, get_handler, evaluate
+from datasets import create_downstream_dataset
+from utils import NativeScalerWithGradNormCount as NativeScaler
+import utils
+import modeling_finetune
+def get_args():
+    parser = argparse.ArgumentParser('BEiT fine-tuning and evaluation script for image classification', add_help=False)
+    # Model parameters
+    parser.add_argument('--model', default='beit_base_patch16_224', type=str, metavar='MODEL',
+                        help='Name of model to train')
+    parser.add_argument('--task', type=str, required=True,
+                        choices=['nlvr2', 'vqav2', 'flickr30k', 'coco_retrieval', 'coco_captioning', 'nocaps', 'imagenet'],
+                        help='Name of task to fine-tuning')
+    parser.add_argument('--input_size', default=224, type=int,
+                        help='images input size')
+    parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT',
+                        help='Drop path rate (default: 0.1)')
+    parser.add_argument('--checkpoint_activations', action='store_true', default=None,
+                        help='Enable checkpointing to save your memory.')
+    parser.add_argument('--sentencepiece_model', type=str, required=True,
+                        help='Sentencepiece model path for the pretrained model.')
+    parser.add_argument('--vocab_size', type=int, default=64010)
+    parser.add_argument('--num_max_bpe_tokens', type=int, default=64)
+    parser.add_argument('--model_ema', action='store_true', default=False)
+    parser.add_argument('--model_ema_decay', type=float, default=0.9999, help='')
+    parser.add_argument('--model_ema_force_cpu', action='store_true', default=False, help='')
+    # Optimizer parameters
+    parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER',
+                        help='Optimizer (default: "adamw"')
+    parser.add_argument('--opt_eps', default=1e-8, type=float, metavar='EPSILON',
+                        help='Optimizer Epsilon (default: 1e-8)')
+    parser.add_argument('--opt_betas', default=[0.9, 0.999], type=float, nargs='+', metavar='BETA',
+                        help='Optimizer Betas (default: 0.9, 0.999, use opt default)')
+    parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM',
+                        help='Clip gradient norm (default: None, no clipping)')
+    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
+                        help='SGD momentum (default: 0.9)')
+    parser.add_argument('--weight_decay', type=float, default=0.05,
+                        help='weight decay (default: 0.05)')
+    parser.add_argument('--lr', type=float, default=5e-4, metavar='LR',
+                        help='learning rate (default: 5e-4)')
+    parser.add_argument('--layer_decay', type=float, default=0.9)
+    parser.add_argument('--task_head_lr_weight', type=float, default=0)
+    parser.add_argument('--warmup_lr', type=float, default=1e-6, metavar='LR',
+                        help='warmup learning rate (default: 1e-6)')
+    parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',
+                        help='lower lr bound for cyclic schedulers that hit 0 (1e-6)')
+    parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N',
+                        help='epochs to warmup LR, if scheduler supports')
+    parser.add_argument('--warmup_steps', type=int, default=-1, metavar='N',
+                        help='num of steps to warmup LR, will overload warmup_epochs if set > 0')
+    parser.add_argument('--batch_size', default=64, type=int)
+    parser.add_argument('--eval_batch_size', default=None, type=int)
+    parser.add_argument('--epochs', default=20, type=int)
+    parser.add_argument('--update_freq', default=1, type=int)
+    parser.add_argument('--save_ckpt_freq', default=5, type=int)
+    # Augmentation parameters
+    parser.add_argument('--randaug', action='store_true', default=False)
+    parser.add_argument('--train_interpolation', type=str, default='bicubic',
+                        help='Training interpolation (random, bilinear, bicubic default: "bicubic")')
+    # Finetuning params
+    parser.add_argument('--finetune', default='',
+                        help='finetune from checkpoint')
+    parser.add_argument('--model_key', default='model|module', type=str)
+    parser.add_argument('--model_prefix', default='', type=str)
+    # Dataset parameters
+    parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str,
+                        help='dataset path')
+    parser.add_argument('--output_dir', default='',
+                        help='path where to save, empty for no saving')
+    parser.add_argument('--log_dir', default=None,
+                        help='path where to tensorboard log')
+    parser.add_argument('--device', default='cuda',
+                        help='device to use for training / testing')
+    parser.add_argument('--seed', default=0, type=int)
+    parser.add_argument('--resume', default='',
+                        help='resume from checkpoint')
+    parser.add_argument('--auto_resume', action='store_true')
+    parser.add_argument('--no_auto_resume', action='store_false', dest='auto_resume')
+    parser.set_defaults(auto_resume=True)
+    parser.add_argument('--save_ckpt', action='store_true')
+    parser.add_argument('--no_save_ckpt', action='store_false', dest='save_ckpt')
+    parser.set_defaults(save_ckpt=True)
+    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
+                        help='start epoch')
+    parser.add_argument('--eval', action='store_true',
+                        help='Perform evaluation only')
+    parser.add_argument('--dist_eval', action='store_true', default=False,
+                        help='Enabling distributed evaluation')
+    parser.add_argument('--num_workers', default=10, type=int)
+    parser.add_argument('--pin_mem', action='store_true',
+                        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
+    parser.set_defaults(pin_mem=True)
+    # distributed training parameters
+    parser.add_argument('--world_size', default=1, type=int,
+                        help='number of distributed processes')
+    parser.add_argument('--local_rank', default=-1, type=int)
+    parser.add_argument('--dist_on_itp', action='store_true')
+    parser.add_argument('--dist_url', default='env://',
+                        help='url used to set up distributed training')
+    # parameter for dump predictions (VQA, COCO captioning, NoCaps)
+    parser.add_argument('--task_cache_path', default=None, type=str)
+    # parameter for imagenet finetuning
+    parser.add_argument('--nb_classes', default=1000, type=int,
+                        help='number of the classification types')
+    parser.add_argument('--mixup', type=float, default=0,
+                        help='mixup alpha, mixup enabled if > 0.')
+    parser.add_argument('--cutmix', type=float, default=0,
+                        help='cutmix alpha, cutmix enabled if > 0.')
+    parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,
+                        help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
+    parser.add_argument('--mixup_prob', type=float, default=1.0,
+                        help='Probability of performing mixup or cutmix when either/both is enabled')
+    parser.add_argument('--mixup_switch_prob', type=float, default=0.5,
+                        help='Probability of switching to cutmix when both mixup and cutmix enabled')
+    parser.add_argument('--mixup_mode', type=str, default='batch',
+                        help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
+    # augmentation parameters for imagenet finetuning
+    parser.add_argument('--color_jitter', type=float, default=0.4, metavar='PCT',
+                        help='Color jitter factor (default: 0.4)')
+    parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
+                        help='Use AutoAugment policy. "v0" or "original". " + "(default: rand-m9-mstd0.5-inc1)')
+    parser.add_argument('--smoothing', type=float, default=0.1,
+                        help='Label smoothing (default: 0.1)')
+    # evaluation parameters for imagenet
+    parser.add_argument('--crop_pct', type=float, default=None)
+    # random Erase params for imagenet finetuning
+    parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
+                        help='Random erase prob (default: 0.25)')
+    parser.add_argument('--remode', type=str, default='pixel',
+                        help='Random erase mode (default: "pixel")')
+    parser.add_argument('--recount', type=int, default=1,
+                        help='Random erase count (default: 1)')
+    parser.add_argument('--resplit', action='store_true', default=False,
+                        help='Do not random erase first (clean) augmentation split')
+    # parameter for captioning finetuning
+    parser.add_argument('--captioning_mask_prob', type=float, default=0.6)
+    parser.add_argument('--drop_worst_ratio', type=float, default=0.2)
+    parser.add_argument('--drop_worst_after', type=int, default=12000)
+    parser.add_argument('--num_beams', type=int, default=3)
+    parser.add_argument('--length_penalty', type=float, default=0.6)
+    # label smoothing for imagenet and captioning
+    parser.add_argument('--label_smoothing', type=float, default=0.1)
+    # deepspeed parameters
+    parser.add_argument('--enable_deepspeed', action='store_true', default=False)
+    parser.add_argument('--initial_scale_power', type=int, default=16)
+    parser.add_argument('--zero_stage', default=0, type=int,
+                        help='ZeRO optimizer stage (default: 0)')
+    known_args, _ = parser.parse_known_args()
+    if known_args.enable_deepspeed:
+        try:
+            import deepspeed
+            from deepspeed import DeepSpeedConfig
+            parser = deepspeed.add_config_arguments(parser)
+            ds_init = deepspeed.initialize
+        except:
+            print("Please 'pip install deepspeed==0.4.0'")
+            exit(0)
+    else:
+        ds_init = None
+    return parser.parse_args(), ds_init
+def main(args, ds_init):
+    utils.init_distributed_mode(args)
+    if ds_init is not None:
+        utils.create_ds_config(args)
+    if args.task_cache_path is None:
+        args.task_cache_path = args.output_dir
+    print(args)
+    device = torch.device(args.device)
+    # fix the seed for reproducibility
+    seed = args.seed + utils.get_rank()
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    # random.seed(seed)
+    cudnn.benchmark = True
+    if utils.get_rank() == 0 and args.log_dir is not None:
+        os.makedirs(args.log_dir, exist_ok=True)
+        log_writer = utils.TensorboardLogger(log_dir=args.log_dir)
+    else:
+        log_writer = None
+    data_loader_train, data_loader_val = create_downstream_dataset(args)
+    if not args.model.endswith(args.task):
+        if args.task in ("flickr30k", "coco_retrieval"):
+            model_config = "%s_retrieval" % args.model
+        elif args.task in ("coco_captioning", "nocaps"):
+            model_config = "%s_captioning" % args.model
+        elif args.task in ("imagenet"):
+            model_config = "%s_imageclassification" % args.model
+        else:
+            model_config = "%s_%s" % (args.model, args.task)
+    else:
+        model_config = args.model
+    print("model_config = %s" % model_config)
+    model = create_model(
+        model_config,
+        pretrained=False,
+        drop_path_rate=args.drop_path,
+        vocab_size=args.vocab_size,
+        checkpoint_activations=args.checkpoint_activations,
+    )
+    if args.finetune:
+        utils.load_model_and_may_interpolate(args.finetune, model, args.model_key, args.model_prefix)
+    model.to(device)
+    model_ema = None
+    if args.model_ema:
+        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
+        model_ema = ModelEma(
+            model,
+            decay=args.model_ema_decay,
+            device='cpu' if args.model_ema_force_cpu else '',
+            resume='')
+        print("Using EMA with decay = %.8f" % args.model_ema_decay)
+    model_without_ddp = model
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print("Model = %s" % str(model_without_ddp))
+    print('number of params:', n_parameters)
+    total_batch_size = args.batch_size * args.update_freq * utils.get_world_size()
+    num_training_steps_per_epoch = len(data_loader_train.dataset) // total_batch_size
+    print("LR = %.8f" % args.lr)
+    print("Batch size = %d" % total_batch_size)
+    print("Update frequent = %d" % args.update_freq)
+    print("Number of training examples = %d" % len(data_loader_train.dataset))
+    print("Number of training training per epoch = %d" % num_training_steps_per_epoch)
+    num_layers = model_without_ddp.get_num_layers()
+    if args.layer_decay < 1.0:
+        lrs = list(args.layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2))
+        assigner = LayerDecayValueAssigner(lrs)
+    elif args.task_head_lr_weight > 1:
+        assigner = LayerDecayValueAssigner([1.0, args.task_head_lr_weight], scale_handler=get_is_head_flag_for_vit)
+    else:
+        assigner = None
+    if assigner is not None:
+        print("Assigned values = %s" % str(assigner.values))
+    skip_weight_decay_list = model.no_weight_decay()
+    if args.distributed:
+        torch.distributed.barrier()
+    if args.enable_deepspeed:
+        loss_scaler = None
+        optimizer_params = get_parameter_groups(
+            model, args.weight_decay, skip_weight_decay_list,
+            assigner.get_layer_id if assigner is not None else None,
+            assigner.get_scale if assigner is not None else None)
+        model, optimizer, _, _ = ds_init(
+            args=args, model=model, model_parameters=optimizer_params,
+            dist_init_required=not args.distributed,
+        )
+        print("model.gradient_accumulation_steps() = %d" % model.gradient_accumulation_steps())
+        assert model.gradient_accumulation_steps() == args.update_freq
+    else:
+        if args.distributed:
+            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
+            model_without_ddp = model.module
+        optimizer = create_optimizer(
+            args, model_without_ddp, skip_list=skip_weight_decay_list,
+            get_num_layer=assigner.get_layer_id if assigner is not None else None,
+            get_layer_scale=assigner.get_scale if assigner is not None else None)
+        loss_scaler = NativeScaler()
+    lr_schedule_values = utils.cosine_scheduler(
+        args.lr, args.min_lr, args.epochs, num_training_steps_per_epoch,
+        warmup_epochs=args.warmup_epochs, warmup_steps=args.warmup_steps,
+    )
+    utils.auto_load_model(
+        args=args, model=model, model_without_ddp=model_without_ddp,
+        optimizer=optimizer, loss_scaler=loss_scaler, model_ema=model_ema)
+    task_handler = get_handler(args)
+    # mixup for imagenet
+    mixup_fn = None
+    if args.task in ["imagenet", "in1k"]:
+        mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
+        if mixup_active:
+            print("Mixup is activated!")
+            mixup_fn = Mixup(
+                mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
+                prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
+                label_smoothing=args.label_smoothing, num_classes=args.nb_classes)
+    if args.eval:
+        data_loader_test = create_downstream_dataset(args, is_eval=True)
+        if args.task in ["nlvr2", "flickr30k", "coco_retrieval", "imagenet"]:
+            ext_test_stats, task_key = evaluate(data_loader_test, model, device, task_handler)
+            print(f"Accuracy of the network on the {len(data_loader_test.dataset)} test images: {ext_test_stats[task_key]:.3f}%")
+            exit(0)
+        elif args.task == "vqav2":
+            result, _ = evaluate(data_loader_test, model, device, task_handler)
+            utils.dump_predictions(args, result, "vqav2_test")
+            exit(0)
+        elif args.task in ["coco_captioning", "nocaps"]:
+            predictions, _ = evaluate(data_loader_test, model, device, task_handler)
+            prediction_file = utils.dump_predictions(args, predictions, "{}_test".format(args.task))
+            if utils.is_main_process() and args.task == "coco_captioning":
+                captioning_result = utils.coco_caption_eval(args.output_dir, prediction_file, "{}_test".format(args.task))
+                result_file = os.path.join(args.output_dir, f"{args.task}_result.json")
+                print(json.dumps(captioning_result))
+                utils.write_result_to_jsonl(captioning_result, result_file)
+            exit(0)
+    print(f"Start training for {args.epochs} epochs")
+    start_time = time.time()
+    max_accuracy = 0.0
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            data_loader_train.sampler.set_epoch(epoch)
+        if log_writer is not None:
+            log_writer.set_step(epoch * num_training_steps_per_epoch * args.update_freq)
+        train_stats = train_one_epoch(
+            model, data_loader_train, optimizer, device, task_handler, epoch,
+            epoch * num_training_steps_per_epoch, lr_schedule_values, loss_scaler,
+            args.clip_grad, args.update_freq, model_ema, log_writer, args.task, mixup_fn,
+        )
+        if args.output_dir and args.save_ckpt:
+            if (epoch + 1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs:
+                utils.save_model(
+                    args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
+                    loss_scaler=loss_scaler, epoch=epoch, model_ema=model_ema)
+        if data_loader_val is not None:
+            if args.task not in ["coco_captioning", "nocaps"]:
+                test_stats, task_key = evaluate(data_loader_val, model, device, task_handler)
+            else:
+                predictions, _ = evaluate(data_loader_val, model, device, task_handler)
+                prediction_file = utils.dump_predictions(args, predictions, f"{args.task}_val_e{epoch}")
+                result_file = os.path.join(args.output_dir, f"{args.task}_result_val_e{epoch}.json")
+                task_key = "CIDEr"
+                if utils.is_main_process():
+                    test_stats = utils.coco_caption_eval(args.output_dir, prediction_file, "{}_val".format(args.task))
+                    utils.write_result_to_jsonl(test_stats, result_file)
+                torch.distributed.barrier()
+                if not utils.is_main_process():
+                    test_stats = utils.read_result_from_jsonl(result_file)
+            print(f"Performance of the network on the {len(data_loader_val.dataset)} val images: {test_stats[task_key]:.1f}%")
+            if max_accuracy < test_stats[task_key]:
+                max_accuracy = test_stats[task_key]
+                if args.output_dir and args.save_ckpt:
+                    utils.save_model(
+                        args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
+                        loss_scaler=loss_scaler, epoch="best", model_ema=model_ema)
+            print(f'Max performance: {max_accuracy:.2f}%')
+            if log_writer is not None:
+                log_writer.update(acc=test_stats[task_key], head="perf", step=epoch)
+            log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
+                        **{f'val_{k}': v for k, v in test_stats.items()},
+                        'epoch': epoch,
+                        'n_parameters': n_parameters}
+        else:
+            log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
+                         # **{f'test_{k}': v for k, v in test_stats.items()},
+                         'epoch': epoch,
+                         'n_parameters': n_parameters}
+        if args.output_dir and utils.is_main_process():
+            if log_writer is not None:
+                log_writer.flush()
+            with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
+                f.write(json.dumps(log_stats) + "\n")
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print('Training time {}'.format(total_time_str))
+# Script entry point: parse the CLI options, make sure the output directory exists, then run.
+if __name__ == '__main__':
+    opts, ds_init = get_args()
+    if opts.output_dir:
+        # parents/exist_ok: create missing ancestors and tolerate an already existing directory.
+        Path(opts.output_dir).mkdir(parents=True, exist_ok=True)
+    main(opts, ds_init)

model/unilm/beit3/utils.py ADDED Viewed

	@@ -0,0 +1,913 @@

+# --------------------------------------------------------
+# Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks (https://arxiv.org/abs/2208.10442)
+# Github source: https://github.com/microsoft/unilm/tree/master/beit3
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import datetime
+import io
+import os
+import math
+import time
+import json
+import argparse
+import numpy as np
+from pathlib import Path
+from collections import defaultdict, deque
+from timm.utils import get_state_dict
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from torch._six import inf
+from torchmetrics import Metric
+from tensorboardX import SummaryWriter
+def bool_flag(s):
+    """
+    Parse boolean arguments from the command line.
+    """
+    FALSY_STRINGS = {"off", "false", "0"}
+    TRUTHY_STRINGS = {"on", "true", "1"}
+    if s.lower() in FALSY_STRINGS:
+        return False
+    elif s.lower() in TRUTHY_STRINGS:
+        return True
+    else:
+        raise argparse.ArgumentTypeError("invalid value for a boolean flag")
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+    def update(self, value, n=1):
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_avail_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+        dist.barrier()
+        dist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+    @property
+    def median(self):
+        d = torch.tensor(list(self.deque))
+        return d.median().item()
+    @property
+    def avg(self):
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+    @property
+    def global_avg(self):
+        return self.total / self.count
+    @property
+    def max(self):
+        return max(self.deque)
+    @property
+    def value(self):
+        return self.deque[-1]
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value)
+class MetricLogger(object):
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if v is None:
+                continue
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+    def __getattr__(self, attr):
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append(
+                "{}: {}".format(name, str(meter))
+            )
+        return self.delimiter.join(loss_str)
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+    def log_every(self, iterable, print_freq, header=None):
+        i = 0
+        if not header:
+            header = ''
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+        log_msg = [
+            header,
+            '[{0' + space_fmt + '}/{1}]',
+            'eta: {eta}',
+            '{meters}',
+            'time: {time}',
+            'data: {data}'
+        ]
+        if torch.cuda.is_available():
+            log_msg.append('max mem: {memory:.0f}')
+        log_msg = self.delimiter.join(log_msg)
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time),
+                        memory=torch.cuda.max_memory_allocated() / MB))
+                else:
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time)))
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, total_time / len(iterable)))
+class TensorboardLogger(object):
+    def __init__(self, log_dir):
+        self.writer = SummaryWriter(logdir=log_dir)
+        self.step = 0
+    def set_step(self, step=None):
+        if step is not None:
+            self.step = step
+        else:
+            self.step += 1
+    def update(self, head='scalar', step=None, **kwargs):
+        for k, v in kwargs.items():
+            if v is None:
+                continue
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.writer.add_scalar(head + "/" + k, v, self.step if step is None else step)
+    def flush(self):
+        self.writer.flush()
+def _load_checkpoint_for_ema(model_ema, checkpoint):
+    """
+    Workaround for ModelEma._load_checkpoint to accept an already-loaded object
+    """
+    mem_file = io.BytesIO()
+    torch.save(checkpoint, mem_file)
+    mem_file.seek(0)
+    model_ema._load_checkpoint(mem_file)
+def setup_for_distributed(is_master):
+    """
+    This function disables printing when not in master process
+    """
+    import builtins as __builtin__
+    builtin_print = __builtin__.print
+    def print(*args, **kwargs):
+        force = kwargs.pop('force', False)
+        if is_master or force:
+            builtin_print(*args, **kwargs)
+    __builtin__.print = print
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+def is_main_process():
+    return get_rank() == 0
+def save_on_master(*args, **kwargs):
+    if is_main_process():
+        torch.save(*args, **kwargs)
+def _get_rank_env():
+    if "RANK" in os.environ:
+        return int(os.environ["RANK"])
+    else:
+        return int(os.environ['OMPI_COMM_WORLD_RANK'])
+def _get_local_rank_env():
+    if "LOCAL_RANK" in os.environ:
+        return int(os.environ["LOCAL_RANK"])
+    else:
+        return int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
+def _get_world_size_env():
+    if "WORLD_SIZE" in os.environ:
+        return int(os.environ["WORLD_SIZE"])
+    else:
+        return int(os.environ['OMPI_COMM_WORLD_SIZE'])
+# The implementation code is modified from DeiT (https://github.com/facebookresearch/deit.git)
+def init_distributed_mode(args):
+    if args.dist_on_itp:
+        args.rank = _get_rank_env()
+        args.world_size = _get_world_size_env()  # int(os.environ['OMPI_COMM_WORLD_SIZE'])
+        args.gpu = _get_local_rank_env()
+        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
+        os.environ['LOCAL_RANK'] = str(args.gpu)
+        os.environ['RANK'] = str(args.rank)
+        os.environ['WORLD_SIZE'] = str(args.world_size)
+        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
+    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.gpu = int(os.environ['LOCAL_RANK'])
+    elif 'SLURM_PROCID' in os.environ:
+        args.rank = int(os.environ['SLURM_PROCID'])
+        args.gpu = args.rank % torch.cuda.device_count()
+    else:
+        print('Not using distributed mode')
+        args.distributed = False
+        return
+    args.distributed = True
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = 'nccl'
+    print('| distributed init (rank {}): {}, gpu {}'.format(
+        args.rank, args.dist_url, args.gpu), flush=True)
+    torch.distributed.init_process_group(
+        backend=args.dist_backend, init_method=args.dist_url,
+        world_size=args.world_size, rank=args.rank,
+        timeout=datetime.timedelta(0, 7200)
+    )
+    torch.distributed.barrier()
+    setup_for_distributed(args.rank == 0)
+def load_state_dict(model, state_dict, prefix='', ignore_missing="relative_position_index"):
+    missing_keys = []
+    unexpected_keys = []
+    error_msgs = []
+    # copy state_dict so _load_from_state_dict can modify it
+    metadata = getattr(state_dict, '_metadata', None)
+    state_dict = state_dict.copy()
+    if metadata is not None:
+        state_dict._metadata = metadata
+    def load(module, prefix=''):
+        local_metadata = {} if metadata is None else metadata.get(
+            prefix[:-1], {})
+        module._load_from_state_dict(
+            state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, prefix + name + '.')
+    load(model, prefix=prefix)
+    warn_missing_keys = []
+    ignore_missing_keys = []
+    for key in missing_keys:
+        keep_flag = True
+        for ignore_key in ignore_missing.split('|'):
+            if ignore_key in key:
+                keep_flag = False
+                break
+        if keep_flag:
+            warn_missing_keys.append(key)
+        else:
+            ignore_missing_keys.append(key)
+    missing_keys = warn_missing_keys
+    if len(missing_keys) > 0:
+        print("Weights of {} not initialized from pretrained model: {}".format(
+            model.__class__.__name__, missing_keys))
+    if len(unexpected_keys) > 0:
+        print("Weights from pretrained model not used in {}: {}".format(
+            model.__class__.__name__, unexpected_keys))
+    if len(ignore_missing_keys) > 0:
+        print("Ignored weights of {} not initialized from pretrained model: {}".format(
+            model.__class__.__name__, ignore_missing_keys))
+    if len(error_msgs) > 0:
+        print('\n'.join(error_msgs))
+class NativeScalerWithGradNormCount:
+    state_dict_key = "amp_scaler"
+    def __init__(self):
+        self._scaler = torch.cuda.amp.GradScaler()
+    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
+        self._scaler.scale(loss).backward(create_graph=create_graph)
+        if update_grad:
+            if clip_grad is not None:
+                assert parameters is not None
+                self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
+                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
+            else:
+                self._scaler.unscale_(optimizer)
+                norm = get_grad_norm_(parameters)
+            self._scaler.step(optimizer)
+            self._scaler.update()
+        else:
+            norm = None
+        return norm
+    def state_dict(self):
+        return self._scaler.state_dict()
+    def load_state_dict(self, state_dict):
+        self._scaler.load_state_dict(state_dict)
+def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = [p for p in parameters if p.grad is not None]
+    norm_type = float(norm_type)
+    if len(parameters) == 0:
+        return torch.tensor(0.)
+    device = parameters[0].grad.device
+    if norm_type == inf:
+        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
+    else:
+        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
+    return total_norm
+def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0,
+                     start_warmup_value=0, warmup_steps=-1, sched_type="cos"):
+    warmup_schedule = np.array([])
+    warmup_iters = warmup_epochs * niter_per_ep
+    if warmup_steps > 0:
+        warmup_iters = warmup_steps
+    print("Set warmup steps = %d" % warmup_iters)
+    if warmup_epochs > 0:
+        warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
+    if sched_type == "cos":
+        iters = np.arange(epochs * niter_per_ep - warmup_iters)
+        schedule = np.array([
+            final_value + 0.5 * (base_value - final_value) * (1 + math.cos(math.pi * i / (len(iters)))) for i in iters])
+    elif sched_type == "linear":
+        schedule = np.linspace(base_value, final_value, epochs * niter_per_ep - warmup_iters)
+    else:
+        raise NotImplementedError()
+    schedule = np.concatenate((warmup_schedule, schedule))
+    assert len(schedule) == epochs * niter_per_ep
+    return schedule
+def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, model_ema=None):
+    output_dir = Path(args.output_dir)
+    if loss_scaler is not None:
+        checkpoint_paths = [output_dir / ('checkpoint-%s.pth' % epoch)]
+        for checkpoint_path in checkpoint_paths:
+            to_save = {
+                'model': model_without_ddp.state_dict(),
+                'optimizer': optimizer.state_dict(),
+                'epoch': epoch,
+                'scaler': loss_scaler.state_dict(),
+                'args': args,
+            }
+            if model_ema is not None:
+                to_save['model_ema'] = get_state_dict(model_ema)
+            save_on_master(to_save, checkpoint_path)
+    else:
+        client_state = {'epoch': epoch, "args": args}
+        if model_ema is not None:
+            client_state['model_ema'] = get_state_dict(model_ema)
+        model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch, client_state=client_state)
+def auto_load_model(args, model, model_without_ddp, optimizer, loss_scaler, model_ema=None):
+    output_dir = Path(args.output_dir)
+    if loss_scaler is not None:
+        # torch.amp
+        if args.auto_resume and len(args.resume) == 0:
+            import glob
+            all_checkpoints = glob.glob(os.path.join(output_dir, 'checkpoint-*.pth'))
+            latest_ckpt = -1
+            for ckpt in all_checkpoints:
+                t = ckpt.split('-')[-1].split('.')[0]
+                if t.isdigit():
+                    latest_ckpt = max(int(t), latest_ckpt)
+            if latest_ckpt >= 0:
+                args.resume = os.path.join(output_dir, 'checkpoint-%d.pth' % latest_ckpt)
+            print("Auto resume checkpoint: %s" % args.resume)
+        if args.resume:
+            if args.resume.startswith('https'):
+                checkpoint = torch.hub.load_state_dict_from_url(
+                    args.resume, map_location='cpu', check_hash=True)
+            else:
+                checkpoint = torch.load(args.resume, map_location='cpu')
+            model_without_ddp.load_state_dict(checkpoint['model'])
+            print("Resume checkpoint %s" % args.resume)
+            if 'optimizer' in checkpoint and 'epoch' in checkpoint:
+                optimizer.load_state_dict(checkpoint['optimizer'])
+                args.start_epoch = checkpoint['epoch'] + 1
+                if hasattr(args, 'model_ema') and args.model_ema:
+                    _load_checkpoint_for_ema(model_ema, checkpoint['model_ema'])
+                if 'scaler' in checkpoint:
+                    loss_scaler.load_state_dict(checkpoint['scaler'])
+                print("With optim & sched!")
+    else:
+        # deepspeed, only support '--auto_resume'.
+        if args.auto_resume:
+            import glob
+            all_checkpoints = glob.glob(os.path.join(output_dir, 'checkpoint-*'))
+            latest_ckpt = -1
+            for ckpt in all_checkpoints:
+                t = ckpt.split('-')[-1].split('.')[0]
+                if t.isdigit():
+                    latest_ckpt = max(int(t), latest_ckpt)
+            if latest_ckpt >= 0:
+                args.resume = os.path.join(output_dir, 'checkpoint-%d' % latest_ckpt)
+                print("Auto resume checkpoint: %d" % latest_ckpt)
+                _, client_states = model.load_checkpoint(args.output_dir, tag='checkpoint-%d' % latest_ckpt)
+                args.start_epoch = client_states['epoch'] + 1
+                if model_ema is not None:
+                    if args.model_ema:
+                        _load_checkpoint_for_ema(model_ema, client_states['model_ema'])
+# The implementation code is modified from DeiT (https://github.com/facebookresearch/deit.git)
+def load_model_and_may_interpolate(ckpt_path, model, model_key, model_prefix):
+    if ckpt_path.startswith('https'):
+        checkpoint = torch.hub.load_state_dict_from_url(
+            ckpt_path, map_location='cpu', check_hash=True)
+    else:
+        checkpoint = torch.load(ckpt_path, map_location='cpu')
+    print("Load ckpt from %s" % ckpt_path)
+    checkpoint_model = None
+    for model_key in model_key.split('|'):
+        if model_key in checkpoint:
+            checkpoint_model = checkpoint[model_key]
+            print("Load state_dict by model_key = %s" % model_key)
+            break
+    if checkpoint_model is None:
+        checkpoint_model = checkpoint
+    state_dict = model.state_dict()
+    for k in ['head.weight', 'head.bias']:
+        if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
+            print(f"Removing key {k} from pretrained checkpoint")
+            del checkpoint_model[k]
+    # interpolate position embedding
+    for pos_embed_key in ("vision_pos_embed", "pos_embed", "beit3.encoder.embed_positions.A.weight"):
+        if pos_embed_key in checkpoint_model:
+            pos_embed_checkpoint = checkpoint_model[pos_embed_key]
+            embedding_size = pos_embed_checkpoint.shape[-1]
+            if pos_embed_key == "beit3.encoder.embed_positions.A.weight":
+                # being consistent with Fairseq, which starts from 2 for position embedding
+                torchscale_model = True
+                num_patches = model.beit3.vision_embed.num_patches
+                num_extra_tokens = model.beit3.vision_embed.num_position_embeddings() + 2 - num_patches
+            else:
+                torchscale_model = False
+                num_patches = model.patch_embed.num_patches
+                num_extra_tokens = getattr(model, pos_embed_key).shape[-2] - num_patches
+            # height (== width) for the checkpoint position embedding
+            orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+            # height (== width) for the new position embedding
+            new_size = int(num_patches ** 0.5)
+            # class_token and dist_token are kept unchanged
+            if orig_size != new_size:
+                print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+                if torchscale_model:
+                    extra_tokens = pos_embed_checkpoint[:num_extra_tokens].unsqueeze(0)
+                    # only the position tokens are interpolated
+                    pos_tokens = pos_embed_checkpoint[num_extra_tokens:]
+                else:
+                    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+                    # only the position tokens are interpolated
+                    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+                pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+                pos_tokens = torch.nn.functional.interpolate(
+                    pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+                pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+                new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+                if torchscale_model:
+                    new_pos_embed = new_pos_embed.squeeze(0)
+                checkpoint_model[pos_embed_key] = new_pos_embed
+    load_state_dict(model, checkpoint_model, prefix=model_prefix)
+def create_ds_config(args):
+    args.deepspeed_config = os.path.join(args.output_dir, "deepspeed_config.json")
+    with open(args.deepspeed_config, mode="w") as writer:
+        ds_config = {
+            "train_batch_size": args.batch_size * args.update_freq * get_world_size(),
+            "train_micro_batch_size_per_gpu": args.batch_size,
+            "steps_per_print": 1000,
+            "optimizer": {
+                "type": "Adam",
+                "adam_w_mode": True,
+                "params": {
+                    "lr": args.lr,
+                    "weight_decay": args.weight_decay,
+                    "bias_correction": True,
+                    "betas": [
+                        args.opt_betas[0],
+                        args.opt_betas[1]
+                    ],
+                    "eps": args.opt_eps
+                }
+            },
+            "fp16": {
+                "enabled": True,
+                "loss_scale": 0,
+                "initial_scale_power": getattr(args, "initial_scale_power", 12),
+                "loss_scale_window": 1000,
+                "hysteresis": 2,
+                "min_loss_scale": 1
+            },
+            "amp": {
+                "enabled": False,
+                "opt_level": "O2"
+            }
+        }
+        if args.clip_grad is not None:
+            ds_config.update({'gradient_clipping': args.clip_grad})
+        if args.zero_stage == 1:
+            ds_config.update({"zero_optimization": {"stage": args.zero_stage, "reduce_bucket_size": 5e8}})
+        elif args.zero_stage > 1:
+            raise NotImplementedError()
+        writer.write(json.dumps(ds_config, indent=2))
+def merge_batch_tensors_by_dict_key(batch):
+    batch_tensors = {}
+    for tensor_key in batch[0]:
+        if isinstance(batch[0][tensor_key], torch.Tensor):
+            batch_tensors[tensor_key] = torch.stack([d[tensor_key] for d in batch])
+        else:
+            batch_tensors[tensor_key] = torch.tensor([d[tensor_key] for d in batch], dtype=torch.long)
+    return batch_tensors
+def get_loss_scale_for_deepspeed(model):
+    optimizer = model.optimizer
+    loss_scale = None
+    if hasattr(optimizer, 'loss_scale'):
+        loss_scale = optimizer.loss_scale
+    elif hasattr(optimizer, 'cur_scale'):
+        loss_scale = optimizer.cur_scale
+    return loss_scale
+class GatherLayer(torch.autograd.Function):
+    """
+    Gather tensors from all workers with support for backward propagation:
+    This implementation does not cut the gradients as torch.distributed.all_gather does.
+    """
+    @staticmethod
+    def forward(ctx, x):
+        output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
+        dist.all_gather(output, x)
+        return tuple(output)
+    @staticmethod
+    def backward(ctx, *grads):
+        all_gradients = torch.stack(grads)
+        dist.all_reduce(all_gradients)
+        return all_gradients[dist.get_rank()]
+def gather_features(
+        image_features,
+        text_features,
+):
+    gathered_image_features = GatherLayer.apply(image_features)
+    gathered_text_features = GatherLayer.apply(text_features)
+    all_image_features = torch.cat(gathered_image_features)
+    all_text_features = torch.cat(gathered_text_features)
+    return all_image_features, all_text_features
+# The implementation code is modified from open_clip (https://github.com/mlfoundations/open_clip.git)
+class ClipLoss(nn.Module):
+    def __init__(
+            self,
+            cache_labels=False,
+            rank=0,
+            world_size=1,
+    ):
+        super().__init__()
+        self.cache_labels = cache_labels
+        self.rank = rank
+        self.world_size = world_size
+        # cache state
+        self.prev_num_logits = 0
+        self.labels = {}
+    def forward(self, image_features, text_features, logit_scale):
+        device = image_features.device
+        if self.world_size > 1:
+            all_image_features, all_text_features = gather_features(
+                image_features, text_features
+            )
+            logits_per_image = logit_scale * image_features @ all_text_features.T
+            logits_per_text = logit_scale * text_features @ all_image_features.T
+        else:
+            logits_per_image = logit_scale * image_features @ text_features.T
+            logits_per_text = logit_scale * text_features @ image_features.T
+        # calculated ground-truth and cache if enabled
+        num_logits = logits_per_image.shape[0]
+        if self.prev_num_logits != num_logits or device not in self.labels:
+            labels = torch.arange(num_logits, device=device, dtype=torch.long)
+            if self.world_size > 1:
+                labels = labels + num_logits * self.rank
+            if self.cache_labels:
+                self.labels[device] = labels
+                self.prev_num_logits = num_logits
+        else:
+            labels = self.labels[device]
+        total_loss = (
+            F.cross_entropy(logits_per_image, labels) +
+            F.cross_entropy(logits_per_text, labels)
+            ) / 2
+        return total_loss, logits_per_image, logits_per_text
+def write_result_to_jsonl(test_stats, result_file):
+    with open(result_file, mode="w", encoding="utf-8") as writer:
+        writer.write(json.dumps(test_stats, indent=None))
+def read_result_from_jsonl(result_file):
+    with open(result_file, mode="r", encoding="utf-8") as reader:
+        return json.load(reader)
+# The implementation code is from ViLT (https://github.com/dandelin/ViLT.git)
+class VQAScore(Metric):
+    def __init__(self, dist_sync_on_step=False):
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state("score", default=torch.tensor(0.0), dist_reduce_fx="sum")
+        self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum")
+    def update(self, logits, target):
+        logits, target = (
+            logits.detach().float().to(self.score.device),
+            target.detach().float().to(self.score.device),
+        )
+        logits = torch.max(logits, 1)[1]
+        one_hots = torch.zeros(*target.size()).to(target)
+        one_hots.scatter_(1, logits.view(-1, 1), 1)
+        scores = one_hots * target
+        self.score += scores.sum()
+        self.total += len(logits)
+    def compute(self):
+        return self.score / self.total
+class BertCaptioningLoss(nn.Module):
+    def __init__(self, label_smoothing, drop_worst_ratio, drop_worst_after):
+        super().__init__()
+        self.label_smoothing = label_smoothing
+        self.drop_worst_ratio = drop_worst_ratio
+        self.drop_worst_after = drop_worst_after
+        self.log_soft = nn.LogSoftmax(dim=1)
+        self.kl = nn.KLDivLoss(reduction='none')
+        self.iter = 0
+    def forward(self, logits, target, iter):
+        eps = self.label_smoothing
+        n_class = logits.size(1)
+        one_hot = torch.zeros_like(logits).scatter(1, target.view(-1, 1), 1)
+        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
+        log_prb = self.log_soft(logits)
+        loss = self.kl(log_prb, one_hot).sum(1)
+        if self.drop_worst_ratio > 0 and iter > self.drop_worst_after:
+            loss, _ = torch.topk(loss,
+                    k=int(loss.shape[0] * (1-self.drop_worst_ratio)),
+                    largest=False)
+        loss = loss.mean()
+        return loss
+class BeamHypotheses(object):
+    def __init__(self, n_hyp, max_length, length_penalty, early_stopping):
+        """
+        Initialize n-best list of hypotheses.
+        """
+        self.max_length = max_length - 1  # ignoring bos_token
+        self.length_penalty = length_penalty
+        self.early_stopping = early_stopping
+        self.n_hyp = n_hyp
+        self.hyp = []
+        self.worst_score = 1e9
+    def __len__(self):
+        """
+        Number of hypotheses in the list.
+        """
+        return len(self.hyp)
+    def add(self, hyp, sum_logprobs):
+        """
+        Add a new hypothesis to the list.
+        """
+        score = sum_logprobs / len(hyp) ** self.length_penalty
+        if len(self) < self.n_hyp or score > self.worst_score:
+            self.hyp.append((score, hyp))
+            if len(self) > self.n_hyp:
+                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.hyp)])
+                del self.hyp[sorted_scores[0][1]]
+                self.worst_score = sorted_scores[1][0]
+            else:
+                self.worst_score = min(score, self.worst_score)
+    def is_done(self, best_sum_logprobs):
+        """
+        If there are enough hypotheses and that none of the hypotheses being generated
+        can become better than the worst one in the heap, then we are done with this sentence.
+        """
+        if len(self) < self.n_hyp:
+            return False
+        elif self.early_stopping:
+            return True
+        else:
+            return self.worst_score >= best_sum_logprobs / self.max_length ** self.length_penalty
+def dump_predictions(args, result, file_suffix):
+    """
+    Write this process's predictions to disk and merge the per-rank files on rank 0.
+    Args:
+        args: provides `task_cache_path` (per-rank scratch files), `output_dir`
+            (final file) and `task` (selects the de-duplication key).
+        result: JSON-serializable list of prediction dicts from this process.
+        file_suffix: appended to the 'submit_' file names.
+    Returns:
+        Path of the merged result file. It is returned on every rank, but the
+        file itself is only written where `jsons` is not None.
+    """
+    global_rank = get_rank()
+    jsons = None
+    if global_rank >= 0:
+        # Each rank dumps its own shard first.
+        output_file = os.path.join(args.task_cache_path, f"submit_{global_rank}_{file_suffix}.json")
+        with open(output_file, "w") as fp:
+            json.dump(result, fp, indent=2)
+        # Wait until every rank has written its shard before rank 0 reads them.
+        torch.distributed.barrier()
+        if global_rank == 0:
+            world_size = get_world_size()
+            jsons = []
+            for i in range(world_size):
+                each_file = os.path.join(args.task_cache_path, f"submit_{i}_{file_suffix}.json")
+                with open(each_file, "r") as fp:
+                    jsons += json.load(fp)
+            new_jsons = []
+            res_dict = dict()
+            if args.task in ["coco_captioning", "nocaps"]:
+                qid_key = "image_id"
+            else:
+                # for VQAv2
+                qid_key = "question_id"
+            # Keep only the first prediction seen for each id.
+            for item in jsons:
+                if item[qid_key] in res_dict:
+                    continue
+                new_jsons.append(item)
+                res_dict[item[qid_key]] = item
+            jsons = new_jsons
+        # Shards are removed only after rank 0 has finished reading them.
+        torch.distributed.barrier()
+        os.remove(output_file)
+    else:
+        # NOTE(review): reached only if get_rank() returns a negative value — confirm when that happens.
+        jsons = result
+    result_file = os.path.join(args.output_dir, f"submit_{file_suffix}.json")
+    if jsons is not None:
+        with open(result_file, "w") as fp:
+            json.dump(jsons, fp, indent=2)
+        print("Infer %d examples into %s" % (len(jsons), result_file))
+    return result_file
+# The evaluation code is from BLIP (https://github.com/salesforce/BLIP)
+# For nocaps, please submit the prediction file to the evaluation server (https://eval.ai/web/challenges/challenge-page/355/overview) to obtain the final results.
+def coco_caption_eval(gt_dir, results_file, split):
+    from pycocotools.coco import COCO
+    from pycocoevalcap.eval import COCOEvalCap
+    from torchvision.datasets.utils import download_url
+    urls = {'coco_captioning_val': 'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json',
+            'coco_captioning_test': 'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json',
+            'nocaps_val': 'https://github.com/addf400/files/releases/download/beit3/nocaps_val_gt.json'}
+    filenames = {'coco_captioning_val':'coco_karpathy_val_gt.json',
+                 'coco_captioning_test':'coco_karpathy_test_gt.json',
+                 'nocaps_val':'nocaps_val_gt.json'}
+    download_url(urls[split], gt_dir)
+    annotation_file = os.path.join(gt_dir, filenames[split])
+    # create coco object and coco_result object
+    coco = COCO(annotation_file)
+    coco_result = coco.loadRes(results_file)
+    # create coco_eval object by taking coco and coco_result
+    coco_eval = COCOEvalCap(coco, coco_result)
+    # evaluate results
+    # SPICE will take a few minutes the first time, but speeds up due to caching
+    coco_eval.evaluate()
+    res_dict = dict()
+    for metric, score in coco_eval.eval.items():
+        res_dict[metric] = score
+    return res_dict

requirements.txt ADDED Viewed

	@@ -0,0 +1,32 @@

+packaging
+sentencepiece
+einops==0.4.1
+fastapi==0.100.1
+markdown2==2.4.10
+numpy==1.24.2
+openai==0.27.8
+opencv_python==4.8.0.74
+Pillow==9.4.0
+pycocotools==2.0.6
+ray==2.6.1
+Requests==2.31.0
+shortuuid==1.0.11
+tqdm==4.64.1
+transformers==4.31.0
+uvicorn==0.23.2
+scipy==1.11.2
+bitsandbytes==0.41.1
+timm==0.4.12
+blobfile
+mypy
+pytest
+requests
+tensorboardX
+ftfy
+opencv-python
+pyarrow
+torchmetrics==0.7.3
+deepspeed
+pycocoevalcap
+torchscale==0.2.0
+gradio

utils/ade20k_classes.json ADDED Viewed

	@@ -0,0 +1,30 @@

+[
+    "wall", "building", "sky", "floor", "tree", "ceiling", "road",
+    "bed", "windowpane", "grass", "cabinet", "sidewalk",
+    "person", "earth", "door", "table", "mountain", "plant",
+    "curtain", "chair", "car", "water", "painting", "sofa",
+    "shelf", "house", "sea", "mirror", "rug", "field", "armchair",
+    "seat", "fence", "desk", "rock", "wardrobe", "lamp",
+    "bathtub", "railing", "cushion", "base", "box", "column",
+    "signboard", "chest of drawers", "counter", "sand", "sink",
+    "skyscraper", "fireplace", "refrigerator", "grandstand",
+    "path", "stairs", "runway", "case", "pool table", "pillow",
+    "screen door", "stairway", "river", "bridge", "bookcase",
+    "blind", "coffee table", "toilet", "flower", "book", "hill",
+    "bench", "countertop", "stove", "palm", "kitchen island",
+    "computer", "swivel chair", "boat", "bar", "arcade machine",
+    "hovel", "bus", "towel", "light", "truck", "tower",
+    "chandelier", "awning", "streetlight", "booth",
+    "television receiver", "airplane", "dirt track", "apparel",
+    "pole", "land", "bannister", "escalator", "ottoman", "bottle",
+    "buffet", "poster", "stage", "van", "ship", "fountain",
+    "conveyer belt", "canopy", "washer", "plaything",
+    "swimming pool", "stool", "barrel", "basket", "waterfall",
+    "tent", "bag", "minibike", "cradle", "oven", "ball", "food",
+    "step", "tank", "trade name", "microwave", "pot", "animal",
+    "bicycle", "lake", "dishwasher", "screen", "blanket",
+    "sculpture", "hood", "sconce", "vase", "traffic light",
+    "tray", "ashcan", "fan", "pier", "crt screen", "plate",
+    "monitor", "bulletin board", "shower", "radiator", "glass",
+    "clock", "flag"
+]

utils/aug.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from copy import deepcopy
+from typing import Tuple
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torchvision.transforms.functional import resize  # type: ignore
+from torchvision.transforms.functional import to_pil_image
+import random
+class RandomScale:
+    """
+    Resizes images to the longest side 'target_length', as well as provides
+    methods for resizing coordinates and boxes. Provides methods for
+    transforming both numpy array and batched torch tensors.
+    """
+    def __init__(self, max_length: int, min_length: int) -> None:
+        self.max_length = max_length
+        self.min_length = min_length
+    def apply_image(self, image: np.ndarray) -> np.ndarray:
+        """
+        Expects a numpy array with shape HxWxC in uint8 format.
+        """
+        target_size = self.get_preprocess_shape(
+            image.shape[0], image.shape[1], self.max_length, self.min_length
+        )
+        return np.array(resize(to_pil_image(image), target_size))
+    def apply_coords(
+        self, coords: np.ndarray, original_size: Tuple[int, ...]
+    ) -> np.ndarray:
+        """
+        Expects a numpy array of length 2 in the final dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.max_length, self.min_length
+        )
+        coords = deepcopy(coords).astype(float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+    def apply_boxes(
+        self, boxes: np.ndarray, original_size: Tuple[int, ...]
+    ) -> np.ndarray:
+        """
+        Expects a numpy array shape Bx4. Requires the original image size
+        in (H, W) format.
+        """
+        boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+    def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
+        """
+        Expects batched images with shape BxCxHxW and float format. This
+        transformation may not exactly match apply_image. apply_image is
+        the transformation expected by the model.
+        """
+        # Expects an image in BCHW format. May not exactly match apply_image.
+        target_size = self.get_preprocess_shape(
+            image.shape[0], image.shape[1], self.max_length, self.min_length
+        )
+        return F.interpolate(
+            image, target_size, mode="bilinear", align_corners=False, antialias=True
+        )
+    def apply_coords_torch(
+        self, coords: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with length 2 in the last dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.max_length, self.min_length
+        )
+        coords = deepcopy(coords).to(torch.float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+    def apply_boxes_torch(
+        self, boxes: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with shape Bx4. Requires the original image
+        size in (H, W) format.
+        """
+        boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+    @staticmethod
+    def get_preprocess_shape(
+        oldh: int, oldw: int, max_length: int, min_length: int
+    ) -> Tuple[int, int]:
+        """
+        Compute the output size given input size and target long side length.
+        """
+        max_scale = max_length * 1.0 / max(oldh, oldw)
+        min_scale = min_length * 1.0 / max(oldh, oldw)
+        scale = min_scale + random.random() * (max_scale-min_scale)
+        newh, neww = oldh * scale, oldw * scale
+        neww = int(neww + 0.5)
+        newh = int(newh + 0.5)
+        return (newh, neww)